Skip to content

Commit 03dedbb

Browse files
zhengruifeng authored and HyukjinKwon committed
[SPARK-55164][PYTHON][TESTS] Refactor tests for python udf input type coercion
### What changes were proposed in this pull request?
Refactor the tests for Python UDF input type coercion.

### Why are the changes needed?
To save and load the golden files with pandas.

### Does this PR introduce _any_ user-facing change?
No, this is a test-only change.

### How was this patch tested?
CI.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #53947 from zhengruifeng/refactor_py_udf_input.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent f7ce07a commit 03dedbb

12 files changed

+635
-226
lines changed

dev/sparktestsupport/modules.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,7 @@ def __hash__(self):
595595
"pyspark.sql.tests.plot.test_frame_plot_plotly",
596596
"pyspark.sql.tests.test_connect_compatibility",
597597
"pyspark.sql.tests.udf_type_tests.test_udf_input_types",
598+
"pyspark.sql.tests.coercion.test_python_udf_input_type",
598599
"pyspark.sql.tests.coercion.test_pandas_udf_return_type",
599600
"pyspark.sql.tests.coercion.test_python_udf_return_type",
600601
],
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
Test Case Spark Type Spark Value Python Type Python Value
2+
0 byte_values tinyint [-128, 127, 0] ['int', 'int', 'int'] ['-128', '127', '0']
3+
1 byte_null tinyint [None, 42] ['NoneType', 'int'] ['None', '42']
4+
2 short_values smallint [-32768, 32767, 0] ['int', 'int', 'int'] ['-32768', '32767', '0']
5+
3 short_null smallint [None, 123] ['NoneType', 'int'] ['None', '123']
6+
4 int_values int [-2147483648, 2147483647, 0] ['int', 'int', 'int'] ['-2147483648', '2147483647', '0']
7+
5 int_null int [None, 456] ['NoneType', 'int'] ['None', '456']
8+
6 long_values bigint [-9223372036854775808, 9223372036854775807, 0] ['int', 'int', 'int'] ['-9223372036854775808', '9223372036854775807', '0']
9+
7 long_null bigint [None, 789] ['NoneType', 'int'] ['None', '789']
10+
8 float_values float [0.0, 1.0, 3.140000104904175] ['float', 'float', 'float'] ['0.0', '1.0', '3.140000104904175']
11+
9 float_null float [None, 3.140000104904175] ['NoneType', 'float'] ['None', '3.140000104904175']
12+
10 double_values double [0.0, 1.0, 0.3333333333333333] ['float', 'float', 'float'] ['0.0', '1.0', '0.3333333333333333']
13+
11 double_null double [None, 2.71] ['NoneType', 'float'] ['None', '2.71']
14+
12 decimal_values decimal(3,2) [Decimal('5.35'), Decimal('1.23')] ['Decimal', 'Decimal'] ['5.35', '1.23']
15+
13 decimal_null decimal(3,2) [None, Decimal('9.99')] ['NoneType', 'Decimal'] ['None', '9.99']
16+
14 string_values string ['abc', '', 'hello'] ['str', 'str', 'str'] ['abc', '', 'hello']
17+
15 string_null string [None, 'test'] ['NoneType', 'str'] ['None', 'test']
18+
16 binary_values binary [b'abc', b'', b'ABC'] ['bytes', 'bytes', 'bytes'] "[""b'abc'"", ""b''"", ""b'ABC'""]"
19+
17 binary_null binary [None, b'test'] ['NoneType', 'bytes'] "['None', ""b'test'""]"
20+
18 boolean_values boolean [True, False] ['bool', 'bool'] ['True', 'False']
21+
19 boolean_null boolean [None, True] ['NoneType', 'bool'] ['None', 'True']
22+
20 date_values date [datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)] ['date', 'date'] ['2020-02-02', '1970-01-01']
23+
21 date_null date [None, datetime.date(2023, 1, 1)] ['NoneType', 'date'] ['None', '2023-01-01']
24+
22 timestamp_values timestamp [datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)] ['datetime'] ['2020-02-02 12:15:16.123000']
25+
23 timestamp_null timestamp [None, datetime.datetime(2023, 1, 1, 12, 0)] ['NoneType', 'datetime'] ['None', '2023-01-01 12:00:00']
26+
24 array_int_values array<int> [[1, 2, 3], [], [1, None, 3]] ['list', 'list', 'list'] ['[1, 2, 3]', '[]', '[1, None, 3]']
27+
25 array_int_null array<int> [None, [4, 5, 6]] ['NoneType', 'list'] ['None', '[4, 5, 6]']
28+
26 map_str_int_values map<string,int> [{'world': 2, 'hello': 1}, {}] ['dict', 'dict'] "[""{'world': 2, 'hello': 1}"", '{}']"
29+
27 map_str_int_null map<string,int> [None, {'test': 123}] ['NoneType', 'dict'] "['None', ""{'test': 123}""]"
30+
28 struct_int_str_values struct<a1:int,a2:string> [Row(a1=1, a2='hello'), Row(a1=2, a2='world')] ['Row', 'Row'] "[""Row(a1=1, a2='hello')"", ""Row(a1=2, a2='world')""]"
31+
29 struct_int_str_null struct<a1:int,a2:string> [None, Row(a1=99, a2='test')] ['NoneType', 'Row'] "['None', ""Row(a1=99, a2='test')""]"
32+
30 array_array_int array<array<int>> [[[1, 2, 3]], [[1], [2, 3]]] ['list', 'list'] ['[[1, 2, 3]]', '[[1], [2, 3]]']
33+
31 array_map_str_int array<map<string,int>> [[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]] ['list', 'list'] "[""[{'world': 2, 'hello': 1}]"", ""[{'a': 1}, {'b': 2}]""]"
34+
32 array_struct_int_str array<struct<a1:int,a2:string>> [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]] ['list', 'list'] "[""[Row(a1=1, a2='hello')]"", ""[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]""]"
35+
33 map_int_array_int map<int,array<int>> [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}] ['dict', 'dict'] ['{1: [1, 2, 3]}', '{1: [1], 2: [2, 3]}']
36+
34 map_int_map_str_int map<int,map<string,int>> [{1: {'world': 2, 'hello': 1}}] ['dict'] "[""{1: {'world': 2, 'hello': 1}}""]"
37+
35 map_int_struct_int_str map<int,struct<a1:int,a2:string>> [{1: Row(a1=1, a2='hello')}] ['dict'] "[""{1: Row(a1=1, a2='hello')}""]"
38+
36 struct_int_array_int struct<a:int,b:array<int>> [Row(a=1, b=[1, 2, 3])] ['Row'] ['Row(a=1, b=[1, 2, 3])']
39+
37 struct_int_map_str_int struct<a:int,b:map<string,int>> [Row(a=1, b={'world': 2, 'hello': 1})] ['Row'] "[""Row(a=1, b={'world': 2, 'hello': 1})""]"
40+
38 struct_int_struct_int_str struct<a:int,b:struct<a1:int,a2:string>> [Row(a=1, b=Row(a1=1, a2='hello'))] ['Row'] "[""Row(a=1, b=Row(a1=1, a2='hello'))""]"
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
| | Test Case | Spark Type | Spark Value | Python Type | Python Value |
2+
|----|---------------------------|------------------------------------------|---------------------------------------------------------------------------|-----------------------------|-------------------------------------------------------------------------------|
3+
| 0 | byte_values | tinyint | [-128, 127, 0] | ['int', 'int', 'int'] | ['-128', '127', '0'] |
4+
| 1 | byte_null | tinyint | [None, 42] | ['NoneType', 'int'] | ['None', '42'] |
5+
| 2 | short_values | smallint | [-32768, 32767, 0] | ['int', 'int', 'int'] | ['-32768', '32767', '0'] |
6+
| 3 | short_null | smallint | [None, 123] | ['NoneType', 'int'] | ['None', '123'] |
7+
| 4 | int_values | int | [-2147483648, 2147483647, 0] | ['int', 'int', 'int'] | ['-2147483648', '2147483647', '0'] |
8+
| 5 | int_null | int | [None, 456] | ['NoneType', 'int'] | ['None', '456'] |
9+
| 6 | long_values | bigint | [-9223372036854775808, 9223372036854775807, 0] | ['int', 'int', 'int'] | ['-9223372036854775808', '9223372036854775807', '0'] |
10+
| 7 | long_null | bigint | [None, 789] | ['NoneType', 'int'] | ['None', '789'] |
11+
| 8 | float_values | float | [0.0, 1.0, 3.140000104904175] | ['float', 'float', 'float'] | ['0.0', '1.0', '3.140000104904175'] |
12+
| 9 | float_null | float | [None, 3.140000104904175] | ['NoneType', 'float'] | ['None', '3.140000104904175'] |
13+
| 10 | double_values | double | [0.0, 1.0, 0.3333333333333333] | ['float', 'float', 'float'] | ['0.0', '1.0', '0.3333333333333333'] |
14+
| 11 | double_null | double | [None, 2.71] | ['NoneType', 'float'] | ['None', '2.71'] |
15+
| 12 | decimal_values | decimal(3,2) | [Decimal('5.35'), Decimal('1.23')] | ['Decimal', 'Decimal'] | ['5.35', '1.23'] |
16+
| 13 | decimal_null | decimal(3,2) | [None, Decimal('9.99')] | ['NoneType', 'Decimal'] | ['None', '9.99'] |
17+
| 14 | string_values | string | ['abc', '', 'hello'] | ['str', 'str', 'str'] | ['abc', '', 'hello'] |
18+
| 15 | string_null | string | [None, 'test'] | ['NoneType', 'str'] | ['None', 'test'] |
19+
| 16 | binary_values | binary | [b'abc', b'', b'ABC'] | ['bytes', 'bytes', 'bytes'] | ["b'abc'", "b''", "b'ABC'"] |
20+
| 17 | binary_null | binary | [None, b'test'] | ['NoneType', 'bytes'] | ['None', "b'test'"] |
21+
| 18 | boolean_values | boolean | [True, False] | ['bool', 'bool'] | ['True', 'False'] |
22+
| 19 | boolean_null | boolean | [None, True] | ['NoneType', 'bool'] | ['None', 'True'] |
23+
| 20 | date_values | date | [datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)] | ['date', 'date'] | ['2020-02-02', '1970-01-01'] |
24+
| 21 | date_null | date | [None, datetime.date(2023, 1, 1)] | ['NoneType', 'date'] | ['None', '2023-01-01'] |
25+
| 22 | timestamp_values | timestamp | [datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)] | ['datetime'] | ['2020-02-02 12:15:16.123000'] |
26+
| 23 | timestamp_null | timestamp | [None, datetime.datetime(2023, 1, 1, 12, 0)] | ['NoneType', 'datetime'] | ['None', '2023-01-01 12:00:00'] |
27+
| 24 | array_int_values | array<int> | [[1, 2, 3], [], [1, None, 3]] | ['list', 'list', 'list'] | ['[1, 2, 3]', '[]', '[1, None, 3]'] |
28+
| 25 | array_int_null | array<int> | [None, [4, 5, 6]] | ['NoneType', 'list'] | ['None', '[4, 5, 6]'] |
29+
| 26 | map_str_int_values | map<string,int> | [{'world': 2, 'hello': 1}, {}] | ['dict', 'dict'] | ["{'world': 2, 'hello': 1}", '{}'] |
30+
| 27 | map_str_int_null | map<string,int> | [None, {'test': 123}] | ['NoneType', 'dict'] | ['None', "{'test': 123}"] |
31+
| 28 | struct_int_str_values | struct<a1:int,a2:string> | [Row(a1=1, a2='hello'), Row(a1=2, a2='world')] | ['Row', 'Row'] | ["Row(a1=1, a2='hello')", "Row(a1=2, a2='world')"] |
32+
| 29 | struct_int_str_null | struct<a1:int,a2:string> | [None, Row(a1=99, a2='test')] | ['NoneType', 'Row'] | ['None', "Row(a1=99, a2='test')"] |
33+
| 30 | array_array_int | array<array<int>> | [[[1, 2, 3]], [[1], [2, 3]]] | ['list', 'list'] | ['[[1, 2, 3]]', '[[1], [2, 3]]'] |
34+
| 31 | array_map_str_int | array<map<string,int>> | [[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]] | ['list', 'list'] | ["[{'world': 2, 'hello': 1}]", "[{'a': 1}, {'b': 2}]"] |
35+
| 32 | array_struct_int_str | array<struct<a1:int,a2:string>> | [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]] | ['list', 'list'] | ["[Row(a1=1, a2='hello')]", "[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]"] |
36+
| 33 | map_int_array_int | map<int,array<int>> | [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}] | ['dict', 'dict'] | ['{1: [1, 2, 3]}', '{1: [1], 2: [2, 3]}'] |
37+
| 34 | map_int_map_str_int | map<int,map<string,int>> | [{1: {'world': 2, 'hello': 1}}] | ['dict'] | ["{1: {'world': 2, 'hello': 1}}"] |
38+
| 35 | map_int_struct_int_str | map<int,struct<a1:int,a2:string>> | [{1: Row(a1=1, a2='hello')}] | ['dict'] | ["{1: Row(a1=1, a2='hello')}"] |
39+
| 36 | struct_int_array_int | struct<a:int,b:array<int>> | [Row(a=1, b=[1, 2, 3])] | ['Row'] | ['Row(a=1, b=[1, 2, 3])'] |
40+
| 37 | struct_int_map_str_int | struct<a:int,b:map<string,int>> | [Row(a=1, b={'world': 2, 'hello': 1})] | ['Row'] | ["Row(a=1, b={'world': 2, 'hello': 1})"] |
41+
| 38 | struct_int_struct_int_str | struct<a:int,b:struct<a1:int,a2:string>> | [Row(a=1, b=Row(a1=1, a2='hello'))] | ['Row'] | ["Row(a=1, b=Row(a1=1, a2='hello'))"] |
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
Test Case Spark Type Spark Value Python Type Python Value
2+
0 byte_values tinyint [-128, 127, 0] ['int', 'int', 'int'] ['-128', '127', '0']
3+
1 byte_null tinyint [None, 42] ['NoneType', 'int'] ['None', '42']
4+
2 short_values smallint [-32768, 32767, 0] ['int', 'int', 'int'] ['-32768', '32767', '0']
5+
3 short_null smallint [None, 123] ['NoneType', 'int'] ['None', '123']
6+
4 int_values int [-2147483648, 2147483647, 0] ['int', 'int', 'int'] ['-2147483648', '2147483647', '0']
7+
5 int_null int [None, 456] ['NoneType', 'int'] ['None', '456']
8+
6 long_values bigint [-9223372036854775808, 9223372036854775807, 0] ['int', 'int', 'int'] ['-9223372036854775808', '9223372036854775807', '0']
9+
7 long_null bigint [None, 789] ['NoneType', 'int'] ['None', '789']
10+
8 float_values float [0.0, 1.0, 3.140000104904175] ['float', 'float', 'float'] ['0.0', '1.0', '3.140000104904175']
11+
9 float_null float [None, 3.140000104904175] ['NoneType', 'float'] ['None', '3.140000104904175']
12+
10 double_values double [0.0, 1.0, 0.3333333333333333] ['float', 'float', 'float'] ['0.0', '1.0', '0.3333333333333333']
13+
11 double_null double [None, 2.71] ['NoneType', 'float'] ['None', '2.71']
14+
12 decimal_values decimal(3,2) [Decimal('5.35'), Decimal('1.23')] ['Decimal', 'Decimal'] ['5.35', '1.23']
15+
13 decimal_null decimal(3,2) [None, Decimal('9.99')] ['NoneType', 'Decimal'] ['None', '9.99']
16+
14 string_values string ['abc', '', 'hello'] ['str', 'str', 'str'] ['abc', '', 'hello']
17+
15 string_null string [None, 'test'] ['NoneType', 'str'] ['None', 'test']
18+
16 binary_values binary [b'abc', b'', b'ABC'] ['bytes', 'bytes', 'bytes'] "[""b'abc'"", ""b''"", ""b'ABC'""]"
19+
17 binary_null binary [None, b'test'] ['NoneType', 'bytes'] "['None', ""b'test'""]"
20+
18 boolean_values boolean [True, False] ['bool', 'bool'] ['True', 'False']
21+
19 boolean_null boolean [None, True] ['NoneType', 'bool'] ['None', 'True']
22+
20 date_values date [datetime.date(2020, 2, 2), datetime.date(1970, 1, 1)] ['date', 'date'] ['2020-02-02', '1970-01-01']
23+
21 date_null date [None, datetime.date(2023, 1, 1)] ['NoneType', 'date'] ['None', '2023-01-01']
24+
22 timestamp_values timestamp [datetime.datetime(2020, 2, 2, 12, 15, 16, 123000)] ['datetime'] ['2020-02-02 12:15:16.123000']
25+
23 timestamp_null timestamp [None, datetime.datetime(2023, 1, 1, 12, 0)] ['NoneType', 'datetime'] ['None', '2023-01-01 12:00:00']
26+
24 array_int_values array<int> [[1, 2, 3], [], [1, None, 3]] ['list', 'list', 'list'] ['[1, 2, 3]', '[]', '[1, None, 3]']
27+
25 array_int_null array<int> [None, [4, 5, 6]] ['NoneType', 'list'] ['None', '[4, 5, 6]']
28+
26 map_str_int_values map<string,int> [{'world': 2, 'hello': 1}, {}] ['dict', 'dict'] "[""{'world': 2, 'hello': 1}"", '{}']"
29+
27 map_str_int_null map<string,int> [None, {'test': 123}] ['NoneType', 'dict'] "['None', ""{'test': 123}""]"
30+
28 struct_int_str_values struct<a1:int,a2:string> [Row(a1=1, a2='hello'), Row(a1=2, a2='world')] ['Row', 'Row'] "[""Row(a1=1, a2='hello')"", ""Row(a1=2, a2='world')""]"
31+
29 struct_int_str_null struct<a1:int,a2:string> [None, Row(a1=99, a2='test')] ['NoneType', 'Row'] "['None', ""Row(a1=99, a2='test')""]"
32+
30 array_array_int array<array<int>> [[[1, 2, 3]], [[1], [2, 3]]] ['list', 'list'] ['[[1, 2, 3]]', '[[1], [2, 3]]']
33+
31 array_map_str_int array<map<string,int>> [[{'world': 2, 'hello': 1}], [{'a': 1}, {'b': 2}]] ['list', 'list'] "[""[{'world': 2, 'hello': 1}]"", ""[{'a': 1}, {'b': 2}]""]"
34+
32 array_struct_int_str array<struct<a1:int,a2:string>> [[Row(a1=1, a2='hello')], [Row(a1=1, a2='hello'), Row(a1=2, a2='world')]] ['list', 'list'] "[""[Row(a1=1, a2='hello')]"", ""[Row(a1=1, a2='hello'), Row(a1=2, a2='world')]""]"
35+
33 map_int_array_int map<int,array<int>> [{1: [1, 2, 3]}, {1: [1], 2: [2, 3]}] ['dict', 'dict'] ['{1: [1, 2, 3]}', '{1: [1], 2: [2, 3]}']
36+
34 map_int_map_str_int map<int,map<string,int>> [{1: {'world': 2, 'hello': 1}}] ['dict'] "[""{1: {'world': 2, 'hello': 1}}""]"
37+
35 map_int_struct_int_str map<int,struct<a1:int,a2:string>> [{1: Row(a1=1, a2='hello')}] ['dict'] "[""{1: Row(a1=1, a2='hello')}""]"
38+
36 struct_int_array_int struct<a:int,b:array<int>> [Row(a=1, b=[1, 2, 3])] ['Row'] ['Row(a=1, b=[1, 2, 3])']
39+
37 struct_int_map_str_int struct<a:int,b:map<string,int>> [Row(a=1, b={'world': 2, 'hello': 1})] ['Row'] "[""Row(a=1, b={'world': 2, 'hello': 1})""]"
40+
38 struct_int_struct_int_str struct<a:int,b:struct<a1:int,a2:string>> [Row(a=1, b=Row(a1=1, a2='hello'))] ['Row'] "[""Row(a=1, b=Row(a1=1, a2='hello'))""]"

0 commit comments

Comments
 (0)