diff --git a/markdown_frames/spark_dataframe.py b/markdown_frames/spark_dataframe.py
index 6fe11fd..c4ed264 100644
--- a/markdown_frames/spark_dataframe.py
+++ b/markdown_frames/spark_dataframe.py
@@ -25,7 +25,8 @@
     make_table,
     get_column_names_types,
     get_data_from_table,
-    get_python_type
+    get_python_type,
+    get_array_inside_type
     )
 from markdown_frames.type_definitions import (
     STRING,
@@ -138,6 +139,6 @@ def _array_type(column_type: str) -> ArrayType:
         column_type.
     :returns: ArrayType
     """
-    inside = column_type[6:-1].strip()
+    inside = get_array_inside_type(column_type)
     return ArrayType(_types_mapping(inside))
 
diff --git a/markdown_frames/utils.py b/markdown_frames/utils.py
index 198b04c..cc2a05e 100644
--- a/markdown_frames/utils.py
+++ b/markdown_frames/utils.py
@@ -10,6 +10,7 @@
 from typing import List, Any, Optional
 from datetime import datetime
 from ast import literal_eval
+import re
 
 from markdown_frames.type_definitions import (
     NULL,
@@ -94,3 +95,18 @@ def get_python_type(value_type: List[str]) -> Optional[Any]:
         return literal_eval(value)
     else:
         return None
+
+def get_array_inside_type(column_type: str) -> Optional[str]:
+    """
+    Given column_type string, extract the type
+    declared inside `array<...>` using regex.
+    :param column_type: string description of
+        column_type, e.g. `array<int>`.
+    :returns: inside type with surrounding whitespace
+        stripped (outermost brackets are used for nested
+        arrays), or None when column_type does not match.
+    """
+    match = re.match(r"array<(.*)>", column_type)
+    if match:
+        return match.group(1).strip()
+    return None
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 91575af..f3d69e0 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -8,6 +8,7 @@
     get_column_names_types,
     get_data_from_table,
     get_python_type,
+    get_array_inside_type
     )
 
 
@@ -217,3 +218,24 @@ def test_get_python_type():
     assert output6 == expected6
     assert output7 == expected7
     assert output8 == expected8
+
+def test_get_array_type():
+    """
+    Test function that, given an array type string pattern,
+    extracts the inside type
+    """
+    input1 = "array<int>"
+    input2 = "array< int >"
+    input3 = "array<array<int>>"
+
+    expected1 = "int"
+    expected2 = "int"
+    expected3 = "array<int>"
+
+    output1 = get_array_inside_type(input1)
+    output2 = get_array_inside_type(input2)
+    output3 = get_array_inside_type(input3)
+
+    assert output1 == expected1
+    assert output2 == expected2
+    assert output3 == expected3