apache · gaogaotiantian · Jan 23, 2026 · Jan 31, 2026 · Feb 3, 2026 · Feb 4, 2026
diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile
@@ -60,7 +60,7 @@ RUN python3.10 -m pip install --ignore-installed 'blinker>=1.6.2' && \
 RUN python3.10 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' \
     sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
     ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.22' pyarrow pandas \
-    'plotly>=4.8' 'docutils<0.18.0' 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' \
+    'plotly>=4.8' 'docutils<0.18.0' 'flake8==3.9.0' 'mypy==1.19.1' 'pytest==7.1.3' \
     'pytest-mypy-plugins==1.9.3' 'black==23.12.1' 'pandas-stubs==1.2.0.53' \
     'grpcio==1.76.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
     'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' \

diff --git a/dev/lint-python b/dev/lint-python
@@ -20,7 +20,7 @@ FLAKE8_BUILD="flake8"
 MINIMUM_FLAKE8="3.9.0"
 RUFF_BUILD="ruff"
 MINIMUM_RUFF="0.14.0"
-MINIMUM_MYPY="1.8.0"
+MINIMUM_MYPY="1.19.1"
 MYPY_BUILD="mypy"
 PYTEST_BUILD="pytest"
 

diff --git a/dev/requirements.txt b/dev/requirements.txt
@@ -23,7 +23,7 @@ coverage
 
 # Linter
 ruff==0.14.8
-mypy==1.8.0
+mypy==1.19.1
 pytest-mypy-plugins==1.9.3
 # See SPARK-38680.
 pandas-stubs>=2.2.0

diff --git a/dev/spark-test-image/docs/Dockerfile b/dev/spark-test-image/docs/Dockerfile
@@ -90,7 +90,7 @@ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
 # See 'docutils<0.18.0' in SPARK-39421
 RUN python3.11 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
   ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.22' pyarrow 'pandas==2.3.3' 'plotly>=4.8' 'docutils<0.18.0' \
-  'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \
+  'flake8==3.9.0' 'mypy==1.19.1' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \
   'pandas-stubs==1.2.0.53' 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
   'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' \
   && python3.11 -m pip cache purge
diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile
@@ -90,7 +90,7 @@ RUN python3.12 -m pip install \
     'ipython_genutils' \
     'jinja2' \
     'matplotlib' \
-    'mypy==1.8.0' \
+    'mypy==1.19.1' \
     'numpy==2.4.1' \
     'numpydoc' \
     'pandas' \

diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
@@ -65,7 +65,7 @@
         rid, text, prob, prediction = row
         print(
             "(%d, %s) --> prob=%s, prediction=%f" % (
-                rid, text, str(prob), prediction   # type: ignore
+                rid, text, str(prob), prediction
             )
         )
     # $example off$

diff --git a/examples/src/main/python/sort.py b/examples/src/main/python/sort.py
@@ -35,7 +35,7 @@
     lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
     sortedCount: RDD[Tuple[int, int]] = lines.flatMap(lambda x: x.split(' ')) \
         .map(lambda x: (int(x), 1)) \
-        .sortByKey()
+        .sortByKey()  # type: ignore[call-overload]
     # This is just a demo on how to bring all the sorted data back to a single node.
     # In reality, we wouldn't want to collect all the data to the driver node.
     output = sortedCount.collect()

diff --git a/examples/src/main/python/streaming/network_wordjoinsentiments.py b/examples/src/main/python/streaming/network_wordjoinsentiments.py
@@ -78,7 +78,7 @@ def line_to_tuple(line: str) -> Tuple[str, str]:
         .transform(lambda rdd: word_sentiments.join(rdd)) \
         .map(lambda word_tuples: (word_tuples[0], float(word_tuples[1][0]) * word_tuples[1][1])) \
         .map(lambda word_happiness: (word_happiness[1], word_happiness[0])) \
-        .transform(lambda rdd: rdd.sortByKey(False))
+        .transform(lambda rdd: rdd.sortByKey(False))  # type: ignore[call-overload]
 
     happiest_words.foreachRDD(print_happiest_words)
 

diff --git a/python/pyspark/core/context.py b/python/pyspark/core/context.py
@@ -834,8 +834,8 @@ def parallelize(self, c: Iterable[T], numSlices: Optional[int] = None) -> RDD[T]
             size = len(c)
             if size == 0:
                 return self.parallelize([], numSlices)
-            step = c[1] - c[0] if size > 1 else 1  # type: ignore[index]
-            start0 = c[0]  # type: ignore[index]
+            step = c[1] - c[0] if size > 1 else 1
+            start0 = c[0]
 
             def getStart(split: int) -> int:
                 assert numSlices is not None

diff --git a/python/pyspark/core/rdd.py b/python/pyspark/core/rdd.py
@@ -1664,7 +1664,7 @@ def func(it: Iterable[T]) -> Iterable[Any]:
             # a generator, so it could return an iterator that we need to
             # go through. We check the common case first, then deal with
             # the undocumented behavior.
-            r = f(it)
+            r = f(it)  # type: ignore[func-returns-value]
             if r is None:
                 return iter([])
             try:

diff --git a/python/pyspark/errors/exceptions/connect.py b/python/pyspark/errors/exceptions/connect.py
@@ -150,7 +150,7 @@ def _convert_exception(
                 sql_state=sql_state,
                 server_stacktrace=stacktrace,
                 display_server_stacktrace=display_server_stacktrace,
-                contexts=contexts,
+                contexts=contexts,  # type: ignore[arg-type]
                 grpc_status_code=grpc_status_code,
                 breaking_change_info=breaking_change_info,
             )
@@ -164,7 +164,7 @@ def _convert_exception(
         sql_state=sql_state,
         server_stacktrace=stacktrace,
         display_server_stacktrace=display_server_stacktrace,
-        contexts=contexts,
+        contexts=contexts,  # type: ignore[arg-type]
         grpc_status_code=grpc_status_code,
         breaking_change_info=breaking_change_info,
     )

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
@@ -2253,7 +2253,7 @@ def setMinWeightFractionPerNode(self, value: float) -> "RandomForestClassifier":
         return self._set(minWeightFractionPerNode=value)
 
 
-class RandomForestClassificationModel(
+class RandomForestClassificationModel(  # type: ignore[misc]
     _TreeEnsembleModel,
     _JavaProbabilisticClassificationModel[Vector],
     _RandomForestClassifierParams,

diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
@@ -404,7 +404,7 @@ def dot(self, other: Iterable[float]) -> np.float64:
             elif isinstance(other, Vector):
                 return np.dot(self.toArray(), other.toArray())
             else:
-                return np.dot(self.toArray(), other)
+                return np.dot(self.toArray(), other)  # type: ignore[call-overload]
 
     def squared_distance(self, other: Iterable[float]) -> np.float64:
         """

diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
@@ -1585,7 +1585,7 @@ def setMinWeightFractionPerNode(self, value: float) -> "RandomForestRegressor":
         return self._set(minWeightFractionPerNode=value)
 
 
-class RandomForestRegressionModel(
+class RandomForestRegressionModel(  # type: ignore[misc]
     _JavaRegressionModel[Vector],
     _TreeEnsembleModel,
     _RandomForestRegressorParams,

diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
@@ -77,13 +77,13 @@
 
 
 def _parallelFitTasks(
-    est: Estimator,
+    est: Estimator[Transformer],
     train: DataFrame,
     eva: Evaluator,
     validation: DataFrame,
     epm: Sequence["ParamMap"],
     collectSubModel: bool,
-) -> List[Callable[[], Tuple[int, float, Transformer]]]:
+) -> List[Callable[[], Tuple[int, float, Union[Transformer, None]]]]:
     """
     Creates a list of callables which can be called from different threads to fit and evaluate
     an estimator in parallel. Each callable returns an `(index, metric)` pair.
@@ -110,7 +110,7 @@ def _parallelFitTasks(
     """
     modelIter = est.fitMultiple(train, epm)
 
-    def singleTask() -> Tuple[int, float, Transformer]:
+    def singleTask() -> Tuple[int, float, Union[Transformer, None]]:
         index, model = next(modelIter)
         # TODO: duplicate evaluator to take extra params from input
         #  Note: Supporting tuning params in evaluator need update method

diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
@@ -457,7 +457,7 @@ def dot(self, other: "VectorLike") -> np.float64:
             elif isinstance(other, Vector):
                 return np.dot(self.toArray(), other.toArray())
             else:
-                return np.dot(self.toArray(), cast("ArrayLike", other))  # type: ignore[valid-type]
+                return np.dot(self.toArray(), cast("ArrayLike", other))
 
     def squared_distance(self, other: "VectorLike") -> np.float64:
         """

diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -336,7 +336,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
             return index_ops._with_new_scol(
                 scol,
                 field=index_ops._internal.data_fields[0].copy(
-                    dtype=dtype, spark_type=spark_type, nullable=nullable  # type: ignore[arg-type]
+                    dtype=dtype, spark_type=spark_type, nullable=nullable
                 ),
             )
         else:

diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -623,9 +623,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
                 ).otherwise(index_ops.spark.column.cast(spark_type))
             return index_ops._with_new_scol(
                 scol.alias(index_ops._internal.data_spark_column_names[0]),
-                field=index_ops._internal.data_fields[0].copy(
-                    dtype=dtype, spark_type=spark_type  # type: ignore[arg-type]
-                ),
+                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
             )
         elif isinstance(spark_type, StringType):
             return _as_string_type(index_ops, dtype, null_str=str(np.nan))
@@ -763,9 +761,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
                 ).otherwise(index_ops.spark.column.cast(spark_type))
             return index_ops._with_new_scol(
                 scol.alias(index_ops._internal.data_spark_column_names[0]),
-                field=index_ops._internal.data_fields[0].copy(
-                    dtype=dtype, spark_type=spark_type  # type: ignore[arg-type]
-                ),
+                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
             )
         elif isinstance(spark_type, StringType):
             return _as_string_type(index_ops, dtype, null_str=str(np.nan))

diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py b/python/pyspark/pandas/data_type_ops/string_ops.py
@@ -132,9 +132,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
                 )
             return index_ops._with_new_scol(
                 scol,
-                field=index_ops._internal.data_fields[0].copy(
-                    dtype=dtype, spark_type=spark_type  # type: ignore[arg-type]
-                ),
+                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
             )
         elif isinstance(spark_type, StringType):
             null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)

diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
@@ -6433,7 +6433,7 @@ def replace(
             def op(psser: ps.Series) -> ps.Series:
                 if psser.name in to_replace_dict:
                     return psser.replace(
-                        to_replace=to_replace_dict[psser.name], value=value, regex=regex
+                        to_replace=to_replace_dict[psser.name], value=value, regex=regex  # type: ignore[arg-type]
                     )
                 else:
                     return psser

diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
@@ -1956,7 +1956,7 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S
         )
 
         if LooseVersion(pd.__version__) < "3.0.0":
-            from pandas.core.common import is_builtin_func  # type: ignore[import-untyped]
+            from pandas.core.common import is_builtin_func  # type: ignore[import-not-found]
 
             f = is_builtin_func(func)
         else:
@@ -2225,7 +2225,7 @@ def _prepare_group_map_apply(
         ]
         psdf = psdf[[s.rename(label) for s, label in zip(groupkeys, groupkey_labels)] + agg_columns]
         groupkey_names = [label if len(label) > 1 else label[0] for label in groupkey_labels]
-        return DataFrame(psdf._internal.resolved_copy), groupkey_labels, groupkey_names
+        return DataFrame(psdf._internal.resolved_copy), groupkey_labels, groupkey_names  # type: ignore[return-value]
 
     @staticmethod
     def _spark_group_map_apply(
@@ -3719,6 +3719,7 @@ def assign_columns(
 
         for col_or_s, label in zip(by, column_labels):
             if label in tmp_column_labels:
+                assert isinstance(col_or_s, Series)
                 psser = col_or_s
                 psdf = align_diff_frames(
                     assign_columns,
@@ -3734,6 +3735,7 @@ def assign_columns(
         new_by_series = []
         for col_or_s, label in zip(by, column_labels):
             if label in tmp_column_labels:
+                assert isinstance(col_or_s, Series)
                 psser = col_or_s
                 new_by_series.append(psdf._psser_for(label).rename(psser.name))
             else:

diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
@@ -41,7 +41,7 @@
     is_object_dtype,
 )
 from pandas.core.accessor import CachedAccessor  # type: ignore[attr-defined]
-from pandas.io.formats.printing import pprint_thing  # type: ignore[import-untyped]
+from pandas.io.formats.printing import pprint_thing  # type: ignore[import-not-found]
 from pandas.api.types import CategoricalDtype, is_hashable
 from pandas._libs import lib
 
@@ -547,7 +547,7 @@ def to_numpy(self, dtype: Optional[Union[str, Dtype]] = None, copy: bool = False
             "It should only be used if the resulting NumPy ndarray is expected to be small."
         )
         result = np.asarray(
-            self._to_internal_pandas()._values, dtype=dtype  # type: ignore[attr-defined]
+            self._to_internal_pandas()._values, dtype=dtype  # type: ignore[attr-defined, arg-type]
         )
         if copy:
             result = result.copy()

diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
@@ -1177,6 +1177,7 @@ def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIn
             spark_frame_other = cast(MultiIndex, other).to_frame()._to_spark()
             keep_name = True
 
+        assert isinstance(other, MultiIndex)
         index_fields = self._index_fields_for_union_like(other, func_name="intersection")
 
         default_name: List[Name] = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]

diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
@@ -1567,7 +1567,7 @@ def prepare_pandas_frame(
         if retain_index:
             index_nlevels = pdf.index.nlevels
             index_columns = [SPARK_INDEX_NAME_FORMAT(i) for i in range(index_nlevels)]
-            pdf.index.names = index_columns  # type: ignore[assignment]
+            pdf.index.names = index_columns
             reset_index = pdf.reset_index()
         else:
             index_nlevels = 0

diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
@@ -1257,6 +1257,7 @@ def output_func(pdf: pd.DataFrame) -> pd.DataFrame:
         return DataFrame(psdf._internal.with_new_sdf(sdf, data_fields=return_data_fields))
 
     if isinstance(sampled, dict):
+        assert sampled is not None
         return {sn: read_excel_on_spark(pdf_or_pser, sn) for sn, pdf_or_pser in sampled.items()}
     else:
         return read_excel_on_spark(cast(Union[pd.DataFrame, pd.Series], sampled), sheet_name)
@@ -2325,10 +2326,10 @@ def get_dummies(
             values = values[1:]
 
         def column_name(v: Any) -> Name:
-            if prefix is None or prefix[i] == "":  # type: ignore[index]
+            if prefix is None or prefix[i] == "":
                 return v
             else:
-                return "{}{}{}".format(prefix[i], prefix_sep, v)  # type: ignore[index]
+                return "{}{}{}".format(prefix[i], prefix_sep, v)
 
         for value in values:
             remaining_columns.append(
@@ -2597,9 +2598,7 @@ def resolve_func(psdf, this_column_labels, that_column_labels):
             concat_psdf = concat_psdf[column_labels]
 
         if ignore_index:
-            concat_psdf.columns = list(  # type: ignore[assignment]
-                map(str, _range(len(concat_psdf.columns)))
-            )
+            concat_psdf.columns = list(map(str, _range(len(concat_psdf.columns))))
 
         if sort:
             concat_psdf = concat_psdf.sort_index()

diff --git a/python/pyspark/pandas/plot/matplotlib.py b/python/pyspark/pandas/plot/matplotlib.py
@@ -24,8 +24,8 @@
 from matplotlib.axes._base import _process_plot_format  # type: ignore[attr-defined]
 from matplotlib.figure import Figure
 from pandas.core.dtypes.inference import is_list_like
-from pandas.io.formats.printing import pprint_thing  # type: ignore[import-untyped]
-from pandas.plotting._matplotlib import (  # type: ignore[import-untyped]
+from pandas.io.formats.printing import pprint_thing  # type: ignore[import-not-found]
+from pandas.plotting._matplotlib import (  # type: ignore[import-not-found]
     BarPlot as PandasBarPlot,
     BoxPlot as PandasBoxPlot,
     HistPlot as PandasHistPlot,
@@ -37,7 +37,7 @@
     KdePlot as PandasKdePlot,
 )
 from pandas.plotting._core import PlotAccessor
-from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot  # type: ignore[import-untyped]
+from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot  # type: ignore[import-not-found]
 
 from pyspark.pandas.plot import (
     TopNPlotBase,

diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
@@ -48,7 +48,7 @@
 import numpy as np
 import pandas as pd
 from pandas.core.accessor import CachedAccessor  # type: ignore[attr-defined]
-from pandas.io.formats.printing import pprint_thing  # type: ignore[import-untyped]
+from pandas.io.formats.printing import pprint_thing  # type: ignore[import-not-found]
 from pandas.api.extensions import no_default
 from pandas.api.types import (
     is_list_like,
@@ -1082,9 +1082,9 @@ def cov(self, other: "Series", min_periods: Optional[int] = None, ddof: int = 1)
         """
         if not isinstance(other, Series):
             raise TypeError("unsupported type: %s" % type(other))
-        if not np.issubdtype(self.dtype, np.number):
+        if not np.issubdtype(self.dtype, np.number):  # type: ignore[arg-type]
             raise TypeError("unsupported dtype: %s" % self.dtype)
-        if not np.issubdtype(other.dtype, np.number):
+        if not np.issubdtype(other.dtype, np.number):  # type: ignore[arg-type]
             raise TypeError("unsupported dtype: %s" % other.dtype)
         if not isinstance(ddof, int):
             raise TypeError("ddof must be integer")

diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
@@ -865,7 +865,7 @@ def _new_type_holders(
                 new_param.tpe = param.stop  # type: ignore[assignment]
             else:
                 # When the given argument is a numpy's dtype instance.
-                new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
+                new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop  # type: ignore[assignment]
             new_params.append(new_param)
         return tuple(new_params)
     elif is_unnamed_params:
@@ -878,7 +878,7 @@ def _new_type_holders(
             if isinstance(param, ExtensionDtype):
                 new_type.tpe = param  # type: ignore[assignment]
             else:
-                new_type.tpe = param.type if isinstance(param, np.dtype) else param
+                new_type.tpe = param.type if isinstance(param, np.dtype) else param  # type: ignore[assignment]
             new_types.append(new_type)
         return tuple(new_types)
     else: