Skip to content

Commit a2198a1

Browse files
gaogaotiantian authored and zhengruifeng committed
[SPARK-55108][PYTHON] Use the latest pandas-stubs for type check
### What changes were proposed in this pull request? Upgrade `pandas-stubs` version (basically do not pin it anymore) and fix all the mypy errors. ### Why are the changes needed? `pandas-stubs` syncs with `pandas` - in theory we should use the same version for both. The stubs for numpy are smarter in that they actually depend on a specific version. `pandas-stubs` does not do that. However, we should definitely not pin it on a 1.x version. Some of the ignore comments might be there because we have a low mypy version (we have to do one upgrade at a time). I did not really "fix" all typing issues - I explicitly ignored a lot of them. Some of the errors are because our type hints are wrong or inaccurate. One step at a time. I think we are moving in the right direction. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Let's check CI. ### Was this patch authored or co-authored using generative AI tooling? No Closes #53877 from gaogaotiantian/upgrade-pandas-stubs. Authored-by: Tian Gao <[email protected]> Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent 4175544 commit a2198a1

35 files changed

+128
-137
lines changed

dev/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ ruff==0.14.8
2626
mypy==1.8.0
2727
pytest-mypy-plugins==1.9.3
2828
# See SPARK-38680.
29-
pandas-stubs<1.2.0.54
29+
pandas-stubs>=2.2.0
3030
scipy-stubs; python_version>='3.10'
3131
types-PyYAML
3232

dev/spark-test-image/lint/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ RUN python3.11 -m pip install \
9494
'numpy==2.0.2' \
9595
'numpydoc' \
9696
'pandas' \
97-
'pandas-stubs==1.2.0.53' \
97+
'pandas-stubs' \
9898
'plotly>=4.8' \
9999
'pyarrow>=22.0.0' \
100100
'pytest-mypy-plugins==1.9.3' \

python/pyspark/instrumentation_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def _attach(
124124
logger_module: Union[str, ModuleType],
125125
modules: List[ModuleType],
126126
classes: List[Type[Any]],
127-
missings: List[Tuple[Type[Any], Type[Any]]],
127+
missings: List[Tuple[Union[ModuleType, Type[Any]], Type[Any]]],
128128
) -> None:
129129
if isinstance(logger_module, str):
130130
logger_module = importlib.import_module(logger_module)

python/pyspark/ml/functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ def _validate_and_transform_single_input(
241241
# tensor columns
242242
if len(batch.columns) == 1:
243243
# one tensor column and one expected input, vstack rows
244-
single_input = np.vstack(batch.iloc[:, 0])
244+
single_input = np.vstack(batch.iloc[:, 0]) # type: ignore[call-overload]
245245
else:
246246
raise ValueError(
247247
"Multiple input columns found, but model expected a single "

python/pyspark/pandas/accessors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,7 @@ def new_func(o: Any) -> Union[pd.DataFrame, pd.Series]:
579579
return original_func(o, *args, **kwargs)
580580

581581
def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
582-
return new_func(pdf).to_frame()
582+
return new_func(pdf).to_frame() # type: ignore[operator]
583583

584584
def pandas_series_func(
585585
f: Callable[[pd.DataFrame], pd.DataFrame], return_type: DataType

python/pyspark/pandas/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
import numpy as np
2828
import pandas as pd
29-
from pandas.api.types import is_list_like, CategoricalDtype # type: ignore[attr-defined]
29+
from pandas.api.types import is_list_like, CategoricalDtype
3030

3131
from pyspark.sql import functions as F, Column, Window
3232
from pyspark.sql.types import LongType, BooleanType, NumericType

python/pyspark/pandas/categorical.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from typing import Any, Callable, List, Optional, Union, TYPE_CHECKING, cast
1818

1919
import pandas as pd
20-
from pandas.api.types import ( # type: ignore[attr-defined]
20+
from pandas.api.types import (
2121
CategoricalDtype,
2222
is_dict_like,
2323
is_list_like,

python/pyspark/pandas/data_type_ops/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def _should_return_all_false(left: IndexOpsLike, right: Any) -> bool:
116116
based on incompatible dtypes: non-numeric vs. numeric (including bools).
117117
"""
118118
from pyspark.pandas.base import IndexOpsMixin
119-
from pandas.api.types import is_list_like # type: ignore[attr-defined]
119+
from pandas.api.types import is_list_like
120120

121121
def are_both_numeric(left_dtype: Dtype, right_dtype: Dtype) -> bool:
122122
return is_numeric_dtype(left_dtype) and is_numeric_dtype(right_dtype)

python/pyspark/pandas/data_type_ops/boolean_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from typing import Any, Union
2020

2121
import pandas as pd
22-
from pandas.api.types import CategoricalDtype, is_integer_dtype # type: ignore[attr-defined]
22+
from pandas.api.types import CategoricalDtype, is_integer_dtype
2323
from pandas.core.dtypes.common import is_numeric_dtype
2424

2525
from pyspark.pandas.base import column_op, IndexOpsMixin

python/pyspark/pandas/data_type_ops/categorical_ops.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@
1616
#
1717

1818
from itertools import chain
19-
from typing import cast, Any, Union
19+
from typing import cast, Any, Sequence, Union
2020

2121
import pandas as pd
2222
import numpy as np
23-
from pandas.api.types import is_list_like, CategoricalDtype # type: ignore[attr-defined]
23+
from pandas.api.types import is_list_like, CategoricalDtype
2424

2525
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
2626
from pyspark.pandas.base import IndexOpsMixin
@@ -43,7 +43,7 @@ def restore(self, col: pd.Series) -> pd.Series:
4343
"""Restore column when to_pandas."""
4444
return pd.Series(
4545
pd.Categorical.from_codes(
46-
col.replace(np.nan, -1).astype(int),
46+
cast(Sequence[int], col.replace(np.nan, -1).astype(int)),
4747
categories=cast(CategoricalDtype, self.dtype).categories,
4848
ordered=cast(CategoricalDtype, self.dtype).ordered,
4949
)

0 commit comments

Comments
 (0)