
Commit 81e8678

ueshin authored and HyukjinKwon committed
[SPARK-56187][PS] Fix Series.argsort null ordering for pandas 3
### What changes were proposed in this pull request?

This PR updates pandas-on-Spark `Series.argsort()` to follow the pandas 3 behavior for null values.

Before this change, `Series.argsort()` always dropped nulls from the ordering step and appended `-1` for null positions. That matches pandas 2 behavior, but it no longer matches pandas 3, where nulls are ordered last and receive real positional indices.

This patch keeps the existing pandas `< 3.0.0` behavior, including the deprecation warning, and switches pandas `>= 3.0.0` to sort the full Series with nulls ordered last so the returned positions match upstream pandas.

### Why are the changes needed?

`pyspark.pandas.tests.series.test_arg_ops SeriesArgOpsTests.test_argsort` fails in the pandas 3 environment because pandas-on-Spark still implements the deprecated pandas 2 null-handling semantics. For example, with null values present:

- pandas 3 returns positional indices for all rows, with nulls ordered last
- pandas-on-Spark returned `-1` for null rows

This makes `Series.argsort()` inconsistent with pandas 3 and causes the existing compatibility test to fail.

### Does this PR introduce _any_ user-facing change?

Yes, it will behave more like pandas 3.

### How was this patch tested?

The existing tests should pass.

### Was this patch authored or co-authored using generative AI tooling?

Generated-by: Codex (GPT-5)

Closes #54989 from ueshin/issues/SPARK-56187/argsort.

Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
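
The pandas 2 vs. pandas 3 difference described above can be sketched with plain NumPy; the function names here are illustrative, not part of the patch:

```python
import numpy as np


def argsort_pandas2_style(values):
    # pandas < 3: drop nulls, argsort only the non-null values,
    # and put -1 at the null positions.
    values = np.asarray(values, dtype=float)
    mask = np.isnan(values)
    result = np.full(len(values), -1, dtype=int)
    result[~mask] = np.argsort(values[~mask])
    return result


def argsort_pandas3_style(values):
    # pandas >= 3: argsort the full array; NaN sorts last and
    # keeps a real positional index instead of -1.
    return np.argsort(np.asarray(values, dtype=float))


vals = [3.0, 1.0, float("nan"), 2.0]
print(argsort_pandas2_style(vals))  # [ 1  2 -1  0]
print(argsort_pandas3_style(vals))  # [1 3 0 2]
```

Note that in the pandas 3 result the null's own position (`2` here) appears last, which is exactly the "nulls ordered last with real positional indices" semantics this patch reproduces.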
1 parent ab2a069 commit 81e8678

File tree

1 file changed

+25
-16
lines changed


python/pyspark/pandas/series.py

Lines changed: 25 additions & 16 deletions
@@ -6422,14 +6422,17 @@ def argsort(self) -> "Series":
         10    10
         dtype: int64
         """
-        warnings.warn(
-            "The behavior of Series.argsort in the presence of NA values is deprecated. "
-            "In a future version, NA values will be ordered last instead of set to -1.",
-            FutureWarning,
-        )
-        notnull = self.loc[self.notnull()]
+        if LooseVersion(pd.__version__) < "3.0.0":
+            warnings.warn(
+                "The behavior of Series.argsort in the presence of NA values is deprecated. "
+                "In a future version, NA values will be ordered last instead of set to -1.",
+                FutureWarning,
+            )
+            source = self.loc[self.notnull()]
+        else:
+            source = self

-        sdf_for_index = notnull._internal.spark_frame.select(notnull._internal.index_spark_columns)
+        sdf_for_index = source._internal.spark_frame.select(source._internal.index_spark_columns)

         tmp_join_key = verify_temp_column_name(sdf_for_index, "__tmp_join_key__")
         sdf_for_index = InternalFrame.attach_distributed_sequence_column(
@@ -6446,8 +6449,8 @@ def argsort(self) -> "Series":
         # |               4|                4|
         # +----------------+-----------------+

-        sdf_for_data = notnull._internal.spark_frame.select(
-            notnull.spark.column.alias("values"), NATURAL_ORDER_COLUMN_NAME
+        sdf_for_data = source._internal.spark_frame.select(
+            source.spark.column.alias("values"), NATURAL_ORDER_COLUMN_NAME
         )
         sdf_for_data = InternalFrame.attach_distributed_sequence_column(
             sdf_for_data, SPARK_DEFAULT_SERIES_NAME
@@ -6463,9 +6466,12 @@ def argsort(self) -> "Series":
         # |  4|     2|     128849018880|
         # +---+------+-----------------+

-        sdf_for_data = sdf_for_data.sort(
-            scol_for(sdf_for_data, "values"), NATURAL_ORDER_COLUMN_NAME
-        ).drop("values", NATURAL_ORDER_COLUMN_NAME)
+        value_scol = scol_for(sdf_for_data, "values")
+        if LooseVersion(pd.__version__) < "3.0.0":
+            sdf_for_data = sdf_for_data.sort(value_scol, NATURAL_ORDER_COLUMN_NAME)
+        else:
+            sdf_for_data = sdf_for_data.sort(value_scol.asc_nulls_last(), NATURAL_ORDER_COLUMN_NAME)
+        sdf_for_data = sdf_for_data.drop("values", NATURAL_ORDER_COLUMN_NAME)

         tmp_join_key = verify_temp_column_name(sdf_for_data, "__tmp_join_key__")
         sdf_for_data = InternalFrame.attach_distributed_sequence_column(sdf_for_data, tmp_join_key)
@@ -6492,10 +6498,13 @@ def argsort(self) -> "Series":
         )
         psser = first_series(DataFrame(internal))

-        return cast(
-            Series,
-            ps.concat([psser, self.loc[self.isnull()].spark.transform(lambda _: F.lit(-1))]),
-        )
+        if LooseVersion(pd.__version__) < "3.0.0":
+            return cast(
+                Series,
+                ps.concat([psser, self.loc[self.isnull()].spark.transform(lambda _: F.lit(-1))]),
+            )
+        else:
+            return psser

     def argmax(self, axis: Axis = None, skipna: bool = True) -> int:
         """

0 commit comments
