Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dev/create-release/spark-rm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ RUN python3.10 -m pip install --ignore-installed 'blinker>=1.6.2' && \
RUN python3.10 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' \
sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.22' pyarrow pandas \
'plotly>=4.8' 'docutils<0.18.0' 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' \
'plotly>=4.8' 'docutils<0.18.0' 'flake8==3.9.0' 'mypy==1.19.1' 'pytest==7.1.3' \
'pytest-mypy-plugins==1.9.3' 'black==23.12.1' 'pandas-stubs==1.2.0.53' \
'grpcio==1.76.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' \
Expand Down
2 changes: 1 addition & 1 deletion dev/lint-python
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ FLAKE8_BUILD="flake8"
MINIMUM_FLAKE8="3.9.0"
RUFF_BUILD="ruff"
MINIMUM_RUFF="0.14.0"
MINIMUM_MYPY="1.8.0"
MINIMUM_MYPY="1.19.1"
MYPY_BUILD="mypy"
PYTEST_BUILD="pytest"

Expand Down
4 changes: 2 additions & 2 deletions dev/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ tabulate

# Linter
ruff==0.14.8
mypy==1.8.0
pytest-mypy-plugins==1.9.3
mypy==1.19.1
pytest-mypy-plugins==3.2.0
# See SPARK-38680.
pandas-stubs>=2.2.0
scipy-stubs;
Expand Down
2 changes: 1 addition & 1 deletion dev/spark-test-image/docs/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
# See 'docutils<0.18.0' in SPARK-39421
RUN python3.11 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.22' pyarrow 'pandas==2.3.3' 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \
'flake8==3.9.0' 'mypy==1.19.1' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' \
&& python3.11 -m pip cache purge
2 changes: 1 addition & 1 deletion dev/spark-test-image/lint/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ RUN python3.12 -m pip install \
'ipython_genutils' \
'jinja2' \
'matplotlib' \
'mypy==1.8.0' \
'mypy==1.19.1' \
'numpy==2.4.1' \
'numpydoc' \
'pandas' \
Expand Down
2 changes: 1 addition & 1 deletion examples/src/main/python/ml/pipeline_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
rid, text, prob, prediction = row
print(
"(%d, %s) --> prob=%s, prediction=%f" % (
rid, text, str(prob), prediction # type: ignore
rid, text, str(prob), prediction
)
)
# $example off$
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/core/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,8 +836,8 @@ def parallelize(self, c: Iterable[T], numSlices: Optional[int] = None) -> RDD[T]
size = len(c)
if size == 0:
return self.parallelize([], numSlices)
step = c[1] - c[0] if size > 1 else 1 # type: ignore[index]
start0 = c[0] # type: ignore[index]
step = c[1] - c[0] if size > 1 else 1
start0 = c[0]

def getStart(split: int) -> int:
assert numSlices is not None
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/core/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1664,7 +1664,7 @@ def func(it: Iterable[T]) -> Iterable[Any]:
# a generator, so it could return an iterator that we need to
# go through. We check the common case first, then deal with
# the undocumented behavior.
r = f(it)
r = f(it) # type: ignore[func-returns-value]
if r is None:
return iter([])
try:
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/errors/exceptions/connect.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def _convert_exception(
sql_state=sql_state,
server_stacktrace=stacktrace,
display_server_stacktrace=display_server_stacktrace,
contexts=contexts,
contexts=contexts, # type: ignore[arg-type]
grpc_status_code=grpc_status_code,
breaking_change_info=breaking_change_info,
)
Expand All @@ -164,7 +164,7 @@ def _convert_exception(
sql_state=sql_state,
server_stacktrace=stacktrace,
display_server_stacktrace=display_server_stacktrace,
contexts=contexts,
contexts=contexts, # type: ignore[arg-type]
grpc_status_code=grpc_status_code,
breaking_change_info=breaking_change_info,
)
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -2253,7 +2253,7 @@ def setMinWeightFractionPerNode(self, value: float) -> "RandomForestClassifier":
return self._set(minWeightFractionPerNode=value)


class RandomForestClassificationModel(
class RandomForestClassificationModel( # type: ignore[misc]
_TreeEnsembleModel,
_JavaProbabilisticClassificationModel[Vector],
_RandomForestClassifierParams,
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/ml/linalg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ def dot(self, other: Iterable[float]) -> np.float64:
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
else:
return np.dot(self.toArray(), other)
return np.dot(self.toArray(), other) # type: ignore[call-overload]

def squared_distance(self, other: Iterable[float]) -> np.float64:
"""
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -1585,7 +1585,7 @@ def setMinWeightFractionPerNode(self, value: float) -> "RandomForestRegressor":
return self._set(minWeightFractionPerNode=value)


class RandomForestRegressionModel(
class RandomForestRegressionModel( # type: ignore[misc]
_JavaRegressionModel[Vector],
_TreeEnsembleModel,
_RandomForestRegressorParams,
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/ml/tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,13 @@


def _parallelFitTasks(
est: Estimator,
est: Estimator[Transformer],
train: DataFrame,
eva: Evaluator,
validation: DataFrame,
epm: Sequence["ParamMap"],
collectSubModel: bool,
) -> List[Callable[[], Tuple[int, float, Transformer]]]:
) -> List[Callable[[], Tuple[int, float, Union[Transformer, None]]]]:
"""
Creates a list of callables which can be called from different threads to fit and evaluate
an estimator in parallel. Each callable returns an `(index, metric)` pair.
Expand All @@ -110,7 +110,7 @@ def _parallelFitTasks(
"""
modelIter = est.fitMultiple(train, epm)

def singleTask() -> Tuple[int, float, Transformer]:
def singleTask() -> Tuple[int, float, Union[Transformer, None]]:
index, model = next(modelIter)
# TODO: duplicate evaluator to take extra params from input
# Note: Supporting tuning params in evaluator need update method
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/mllib/linalg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def dot(self, other: "VectorLike") -> np.float64:
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
else:
return np.dot(self.toArray(), cast("ArrayLike", other)) # type: ignore[valid-type]
return np.dot(self.toArray(), cast("ArrayLike", other))

def squared_distance(self, other: "VectorLike") -> np.float64:
"""
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/pandas/data_type_ops/boolean_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
return index_ops._with_new_scol(
scol,
field=index_ops._internal.data_fields[0].copy(
dtype=dtype, spark_type=spark_type, nullable=nullable # type: ignore[arg-type]
dtype=dtype, spark_type=spark_type, nullable=nullable
),
)
else:
Expand Down
8 changes: 2 additions & 6 deletions python/pyspark/pandas/data_type_ops/num_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,9 +623,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
).otherwise(index_ops.spark.column.cast(spark_type))
return index_ops._with_new_scol(
scol.alias(index_ops._internal.data_spark_column_names[0]),
field=index_ops._internal.data_fields[0].copy(
dtype=dtype, spark_type=spark_type # type: ignore[arg-type]
),
field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
)
elif isinstance(spark_type, StringType):
return _as_string_type(index_ops, dtype, null_str=str(np.nan))
Expand Down Expand Up @@ -763,9 +761,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
).otherwise(index_ops.spark.column.cast(spark_type))
return index_ops._with_new_scol(
scol.alias(index_ops._internal.data_spark_column_names[0]),
field=index_ops._internal.data_fields[0].copy(
dtype=dtype, spark_type=spark_type # type: ignore[arg-type]
),
field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
)
elif isinstance(spark_type, StringType):
return _as_string_type(index_ops, dtype, null_str=str(np.nan))
Expand Down
4 changes: 1 addition & 3 deletions python/pyspark/pandas/data_type_ops/string_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
)
return index_ops._with_new_scol(
scol,
field=index_ops._internal.data_fields[0].copy(
dtype=dtype, spark_type=spark_type # type: ignore[arg-type]
),
field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
)
elif isinstance(spark_type, StringType):
null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6433,7 +6433,7 @@ def replace(
def op(psser: ps.Series) -> ps.Series:
if psser.name in to_replace_dict:
return psser.replace(
to_replace=to_replace_dict[psser.name], value=value, regex=regex
to_replace=to_replace_dict[psser.name], value=value, regex=regex # type: ignore[arg-type]
)
else:
return psser
Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1980,7 +1980,7 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S
)

if LooseVersion(pd.__version__) < "3.0.0":
from pandas.core.common import is_builtin_func # type: ignore[import-untyped]
from pandas.core.common import is_builtin_func # type: ignore[import-not-found]

f = is_builtin_func(func)
else:
Expand Down Expand Up @@ -2249,7 +2249,7 @@ def _prepare_group_map_apply(
]
psdf = psdf[[s.rename(label) for s, label in zip(groupkeys, groupkey_labels)] + agg_columns]
groupkey_names = [label if len(label) > 1 else label[0] for label in groupkey_labels]
return DataFrame(psdf._internal.resolved_copy), groupkey_labels, groupkey_names
return DataFrame(psdf._internal.resolved_copy), groupkey_labels, groupkey_names # type: ignore[return-value]

@staticmethod
def _spark_group_map_apply(
Expand Down Expand Up @@ -3746,6 +3746,7 @@ def assign_columns(

for col_or_s, label in zip(by, column_labels):
if label in tmp_column_labels:
assert isinstance(col_or_s, Series)
psser = col_or_s
psdf = align_diff_frames(
assign_columns,
Expand All @@ -3761,6 +3762,7 @@ def assign_columns(
new_by_series = []
for col_or_s, label in zip(by, column_labels):
if label in tmp_column_labels:
assert isinstance(col_or_s, Series)
psser = col_or_s
new_by_series.append(psdf._psser_for(label).rename(psser.name))
else:
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
is_object_dtype,
)
from pandas.core.accessor import CachedAccessor # type: ignore[attr-defined]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-untyped]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-not-found]
from pandas.api.types import CategoricalDtype, is_hashable
from pandas._libs import lib

Expand Down Expand Up @@ -547,7 +547,7 @@ def to_numpy(self, dtype: Optional[Union[str, Dtype]] = None, copy: bool = False
"It should only be used if the resulting NumPy ndarray is expected to be small."
)
result = np.asarray(
self._to_internal_pandas()._values, dtype=dtype # type: ignore[attr-defined]
self._to_internal_pandas()._values, dtype=dtype # type: ignore[attr-defined, arg-type]
)
if copy:
result = result.copy()
Expand Down
1 change: 1 addition & 0 deletions python/pyspark/pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1177,6 +1177,7 @@ def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIn
spark_frame_other = cast(MultiIndex, other).to_frame()._to_spark()
keep_name = True

assert isinstance(other, MultiIndex)
index_fields = self._index_fields_for_union_like(other, func_name="intersection")

default_name: List[Name] = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/pandas/internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1567,7 +1567,7 @@ def prepare_pandas_frame(
if retain_index:
index_nlevels = pdf.index.nlevels
index_columns = [SPARK_INDEX_NAME_FORMAT(i) for i in range(index_nlevels)]
pdf.index.names = index_columns # type: ignore[assignment]
pdf.index.names = index_columns
reset_index = pdf.reset_index()
else:
index_nlevels = 0
Expand Down
9 changes: 4 additions & 5 deletions python/pyspark/pandas/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1269,6 +1269,7 @@ def output_func(pdf: pd.DataFrame) -> pd.DataFrame:
return DataFrame(psdf._internal.with_new_sdf(sdf, data_fields=return_data_fields))

if isinstance(sampled, dict):
assert sampled is not None
return {sn: read_excel_on_spark(pdf_or_pser, sn) for sn, pdf_or_pser in sampled.items()}
else:
return read_excel_on_spark(cast(Union[pd.DataFrame, pd.Series], sampled), sheet_name)
Expand Down Expand Up @@ -2340,10 +2341,10 @@ def get_dummies(
values = values[1:]

def column_name(v: Any) -> Name:
if prefix is None or prefix[i] == "": # type: ignore[index]
if prefix is None or prefix[i] == "":
return v
else:
return "{}{}{}".format(prefix[i], prefix_sep, v) # type: ignore[index]
return "{}{}{}".format(prefix[i], prefix_sep, v)

for value in values:
remaining_columns.append(
Expand Down Expand Up @@ -2612,9 +2613,7 @@ def resolve_func(psdf, this_column_labels, that_column_labels):
concat_psdf = concat_psdf[column_labels]

if ignore_index:
concat_psdf.columns = list( # type: ignore[assignment]
map(str, _range(len(concat_psdf.columns)))
)
concat_psdf.columns = list(map(str, _range(len(concat_psdf.columns))))

if sort:
concat_psdf = concat_psdf.sort_index()
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/pandas/plot/matplotlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
from matplotlib.figure import Figure
import pandas as pd
from pandas.core.dtypes.inference import is_list_like
from pandas.io.formats.printing import pprint_thing # type: ignore[import-untyped]
from pandas.plotting._matplotlib import ( # type: ignore[import-untyped]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-not-found]
from pandas.plotting._matplotlib import ( # type: ignore[import-not-found]
BarPlot as PandasBarPlot,
BoxPlot as PandasBoxPlot,
HistPlot as PandasHistPlot,
Expand All @@ -38,7 +38,7 @@
KdePlot as PandasKdePlot,
)
from pandas.plotting._core import PlotAccessor
from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot # type: ignore[import-untyped]
from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot # type: ignore[import-not-found]

from pyspark.pandas.plot import (
TopNPlotBase,
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
import numpy as np
import pandas as pd
from pandas.core.accessor import CachedAccessor # type: ignore[attr-defined]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-untyped]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-not-found]
from pandas.api.extensions import no_default
from pandas.api.types import (
is_list_like,
Expand Down Expand Up @@ -1082,9 +1082,9 @@ def cov(self, other: "Series", min_periods: Optional[int] = None, ddof: int = 1)
"""
if not isinstance(other, Series):
raise TypeError("unsupported type: %s" % type(other))
if not np.issubdtype(self.dtype, np.number):
if not np.issubdtype(self.dtype, np.number): # type: ignore[arg-type]
raise TypeError("unsupported dtype: %s" % self.dtype)
if not np.issubdtype(other.dtype, np.number):
if not np.issubdtype(other.dtype, np.number): # type: ignore[arg-type]
raise TypeError("unsupported dtype: %s" % other.dtype)
if not isinstance(ddof, int):
raise TypeError("ddof must be integer")
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/pandas/typedef/typehints.py
Original file line number Diff line number Diff line change
Expand Up @@ -865,7 +865,7 @@ def _new_type_holders(
new_param.tpe = param.stop # type: ignore[assignment]
else:
# When the given argument is a numpy's dtype instance.
new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop # type: ignore[assignment]
new_params.append(new_param)
return tuple(new_params)
elif is_unnamed_params:
Expand All @@ -878,7 +878,7 @@ def _new_type_holders(
if isinstance(param, ExtensionDtype):
new_type.tpe = param # type: ignore[assignment]
else:
new_type.tpe = param.type if isinstance(param, np.dtype) else param
new_type.tpe = param.type if isinstance(param, np.dtype) else param # type: ignore[assignment]
new_types.append(new_type)
return tuple(new_types)
else:
Expand Down
16 changes: 8 additions & 8 deletions python/pyspark/pandas/usage_logging/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,15 @@ def attach(logger_module: Union[str, ModuleType]) -> None:
(pd.Index, MissingPandasLikeIndex),
(pd.MultiIndex, MissingPandasLikeMultiIndex),
(pd.DatetimeIndex, MissingPandasLikeDatetimeIndex),
(pd.core.groupby.DataFrameGroupBy, MissingPandasLikeDataFrameGroupBy),
(pd.core.groupby.SeriesGroupBy, MissingPandasLikeSeriesGroupBy),
(pd.core.window.Expanding, MissingPandasLikeExpanding),
(pd.core.window.Rolling, MissingPandasLikeRolling),
(pd.core.window.ExpandingGroupby, MissingPandasLikeExpandingGroupby),
(pd.core.window.RollingGroupby, MissingPandasLikeRollingGroupby),
(pd.core.window.ExponentialMovingWindow, MissingPandasLikeExponentialMoving),
(pd.core.groupby.DataFrameGroupBy, MissingPandasLikeDataFrameGroupBy), # type: ignore[attr-defined]
(pd.core.groupby.SeriesGroupBy, MissingPandasLikeSeriesGroupBy), # type: ignore[attr-defined]
(pd.core.window.Expanding, MissingPandasLikeExpanding), # type: ignore[attr-defined]
(pd.core.window.Rolling, MissingPandasLikeRolling), # type: ignore[attr-defined]
(pd.core.window.ExpandingGroupby, MissingPandasLikeExpandingGroupby), # type: ignore[attr-defined]
(pd.core.window.RollingGroupby, MissingPandasLikeRollingGroupby), # type: ignore[attr-defined]
(pd.core.window.ExponentialMovingWindow, MissingPandasLikeExponentialMoving), # type: ignore[attr-defined]
(
pd.core.window.ExponentialMovingWindowGroupby,
pd.core.window.ExponentialMovingWindowGroupby, # type: ignore[attr-defined]
MissingPandasLikeExponentialMovingGroupby,
),
]
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/pandas/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def align_diff_frames(
["DataFrame", List[Label], List[Label]], Iterator[Tuple["Series", Label]]
],
this: "DataFrame",
that: "DataFrame",
that: "DataFrameOrSeries",
fillna: bool = True,
how: str = "full",
preserve_order_column: bool = False,
Expand Down
Loading