Skip to content
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dev/create-release/spark-rm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ RUN python3.10 -m pip install --ignore-installed 'blinker>=1.6.2' && \
RUN python3.10 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' \
sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.22' pyarrow pandas \
'plotly>=4.8' 'docutils<0.18.0' 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' \
'plotly>=4.8' 'docutils<0.18.0' 'flake8==3.9.0' 'mypy==1.19.1' 'pytest==7.1.3' \
'pytest-mypy-plugins==1.9.3' 'black==23.12.1' 'pandas-stubs==1.2.0.53' \
'grpcio==1.76.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' \
Expand Down
2 changes: 1 addition & 1 deletion dev/lint-python
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ FLAKE8_BUILD="flake8"
MINIMUM_FLAKE8="3.9.0"
RUFF_BUILD="ruff"
MINIMUM_RUFF="0.14.0"
MINIMUM_MYPY="1.8.0"
MINIMUM_MYPY="1.19.1"
MYPY_BUILD="mypy"
PYTEST_BUILD="pytest"

Expand Down
2 changes: 1 addition & 1 deletion dev/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ coverage

# Linter
ruff==0.14.8
mypy==1.8.0
mypy==1.19.1
pytest-mypy-plugins==1.9.3
# See SPARK-38680.
pandas-stubs>=2.2.0
Expand Down
2 changes: 1 addition & 1 deletion dev/spark-test-image/docs/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
# See 'docutils<0.18.0' in SPARK-39421
RUN python3.11 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.22' pyarrow 'pandas==2.3.3' 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \
'flake8==3.9.0' 'mypy==1.19.1' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' \
&& python3.11 -m pip cache purge
2 changes: 1 addition & 1 deletion dev/spark-test-image/lint/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ RUN python3.12 -m pip install \
'ipython_genutils' \
'jinja2' \
'matplotlib' \
'mypy==1.8.0' \
'mypy==1.19.1' \
'numpy==2.4.1' \
'numpydoc' \
'pandas' \
Expand Down
2 changes: 1 addition & 1 deletion examples/src/main/python/ml/pipeline_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
rid, text, prob, prediction = row
print(
"(%d, %s) --> prob=%s, prediction=%f" % (
rid, text, str(prob), prediction # type: ignore
rid, text, str(prob), prediction
)
)
# $example off$
Expand Down
2 changes: 1 addition & 1 deletion examples/src/main/python/sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
sortedCount: RDD[Tuple[int, int]] = lines.flatMap(lambda x: x.split(' ')) \
.map(lambda x: (int(x), 1)) \
.sortByKey()
.sortByKey() # type: ignore[call-overload]
# This is just a demo on how to bring all the sorted data back to a single node.
# In reality, we wouldn't want to collect all the data to the driver node.
output = sortedCount.collect()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def line_to_tuple(line: str) -> Tuple[str, str]:
.transform(lambda rdd: word_sentiments.join(rdd)) \
.map(lambda word_tuples: (word_tuples[0], float(word_tuples[1][0]) * word_tuples[1][1])) \
.map(lambda word_happiness: (word_happiness[1], word_happiness[0])) \
.transform(lambda rdd: rdd.sortByKey(False))
.transform(lambda rdd: rdd.sortByKey(False)) # type: ignore[call-overload]

happiest_words.foreachRDD(print_happiest_words)

Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/core/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,8 +834,8 @@ def parallelize(self, c: Iterable[T], numSlices: Optional[int] = None) -> RDD[T]
size = len(c)
if size == 0:
return self.parallelize([], numSlices)
step = c[1] - c[0] if size > 1 else 1 # type: ignore[index]
start0 = c[0] # type: ignore[index]
step = c[1] - c[0] if size > 1 else 1
start0 = c[0]

def getStart(split: int) -> int:
assert numSlices is not None
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/core/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1664,7 +1664,7 @@ def func(it: Iterable[T]) -> Iterable[Any]:
# a generator, so it could return an iterator that we need to
# go through. We check the common case first, then deal with
# the undocumented behavior.
r = f(it)
r = f(it) # type: ignore[func-returns-value]
if r is None:
return iter([])
try:
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/errors/exceptions/connect.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def _convert_exception(
sql_state=sql_state,
server_stacktrace=stacktrace,
display_server_stacktrace=display_server_stacktrace,
contexts=contexts,
contexts=contexts, # type: ignore[arg-type]
grpc_status_code=grpc_status_code,
breaking_change_info=breaking_change_info,
)
Expand All @@ -164,7 +164,7 @@ def _convert_exception(
sql_state=sql_state,
server_stacktrace=stacktrace,
display_server_stacktrace=display_server_stacktrace,
contexts=contexts,
contexts=contexts, # type: ignore[arg-type]
grpc_status_code=grpc_status_code,
breaking_change_info=breaking_change_info,
)
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -2253,7 +2253,7 @@ def setMinWeightFractionPerNode(self, value: float) -> "RandomForestClassifier":
return self._set(minWeightFractionPerNode=value)


class RandomForestClassificationModel(
class RandomForestClassificationModel( # type: ignore[misc]
_TreeEnsembleModel,
_JavaProbabilisticClassificationModel[Vector],
_RandomForestClassifierParams,
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/ml/linalg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ def dot(self, other: Iterable[float]) -> np.float64:
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
else:
return np.dot(self.toArray(), other)
return np.dot(self.toArray(), other) # type: ignore[call-overload]

def squared_distance(self, other: Iterable[float]) -> np.float64:
"""
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -1585,7 +1585,7 @@ def setMinWeightFractionPerNode(self, value: float) -> "RandomForestRegressor":
return self._set(minWeightFractionPerNode=value)


class RandomForestRegressionModel(
class RandomForestRegressionModel( # type: ignore[misc]
_JavaRegressionModel[Vector],
_TreeEnsembleModel,
_RandomForestRegressorParams,
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/ml/tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,13 @@


def _parallelFitTasks(
est: Estimator,
est: Estimator[Transformer],
train: DataFrame,
eva: Evaluator,
validation: DataFrame,
epm: Sequence["ParamMap"],
collectSubModel: bool,
) -> List[Callable[[], Tuple[int, float, Transformer]]]:
) -> List[Callable[[], Tuple[int, float, Union[Transformer, None]]]]:
"""
Creates a list of callables which can be called from different threads to fit and evaluate
an estimator in parallel. Each callable returns an `(index, metric)` pair.
Expand All @@ -110,7 +110,7 @@ def _parallelFitTasks(
"""
modelIter = est.fitMultiple(train, epm)

def singleTask() -> Tuple[int, float, Transformer]:
def singleTask() -> Tuple[int, float, Union[Transformer, None]]:
index, model = next(modelIter)
# TODO: duplicate evaluator to take extra params from input
# Note: Supporting tuning params in evaluator need update method
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/mllib/linalg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def dot(self, other: "VectorLike") -> np.float64:
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
else:
return np.dot(self.toArray(), cast("ArrayLike", other)) # type: ignore[valid-type]
return np.dot(self.toArray(), cast("ArrayLike", other))

def squared_distance(self, other: "VectorLike") -> np.float64:
"""
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/pandas/data_type_ops/boolean_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
return index_ops._with_new_scol(
scol,
field=index_ops._internal.data_fields[0].copy(
dtype=dtype, spark_type=spark_type, nullable=nullable # type: ignore[arg-type]
dtype=dtype, spark_type=spark_type, nullable=nullable
),
)
else:
Expand Down
8 changes: 2 additions & 6 deletions python/pyspark/pandas/data_type_ops/num_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,9 +623,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
).otherwise(index_ops.spark.column.cast(spark_type))
return index_ops._with_new_scol(
scol.alias(index_ops._internal.data_spark_column_names[0]),
field=index_ops._internal.data_fields[0].copy(
dtype=dtype, spark_type=spark_type # type: ignore[arg-type]
),
field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
)
elif isinstance(spark_type, StringType):
return _as_string_type(index_ops, dtype, null_str=str(np.nan))
Expand Down Expand Up @@ -763,9 +761,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
).otherwise(index_ops.spark.column.cast(spark_type))
return index_ops._with_new_scol(
scol.alias(index_ops._internal.data_spark_column_names[0]),
field=index_ops._internal.data_fields[0].copy(
dtype=dtype, spark_type=spark_type # type: ignore[arg-type]
),
field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
)
elif isinstance(spark_type, StringType):
return _as_string_type(index_ops, dtype, null_str=str(np.nan))
Expand Down
4 changes: 1 addition & 3 deletions python/pyspark/pandas/data_type_ops/string_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,7 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
)
return index_ops._with_new_scol(
scol,
field=index_ops._internal.data_fields[0].copy(
dtype=dtype, spark_type=spark_type # type: ignore[arg-type]
),
field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
)
elif isinstance(spark_type, StringType):
null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6433,7 +6433,7 @@ def replace(
def op(psser: ps.Series) -> ps.Series:
if psser.name in to_replace_dict:
return psser.replace(
to_replace=to_replace_dict[psser.name], value=value, regex=regex
to_replace=to_replace_dict[psser.name], value=value, regex=regex # type: ignore[arg-type]
)
else:
return psser
Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1956,7 +1956,7 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S
)

if LooseVersion(pd.__version__) < "3.0.0":
from pandas.core.common import is_builtin_func # type: ignore[import-untyped]
from pandas.core.common import is_builtin_func # type: ignore[import-not-found]

f = is_builtin_func(func)
else:
Expand Down Expand Up @@ -2225,7 +2225,7 @@ def _prepare_group_map_apply(
]
psdf = psdf[[s.rename(label) for s, label in zip(groupkeys, groupkey_labels)] + agg_columns]
groupkey_names = [label if len(label) > 1 else label[0] for label in groupkey_labels]
return DataFrame(psdf._internal.resolved_copy), groupkey_labels, groupkey_names
return DataFrame(psdf._internal.resolved_copy), groupkey_labels, groupkey_names # type: ignore[return-value]

@staticmethod
def _spark_group_map_apply(
Expand Down Expand Up @@ -3719,6 +3719,7 @@ def assign_columns(

for col_or_s, label in zip(by, column_labels):
if label in tmp_column_labels:
assert isinstance(col_or_s, Series)
psser = col_or_s
psdf = align_diff_frames(
assign_columns,
Expand All @@ -3734,6 +3735,7 @@ def assign_columns(
new_by_series = []
for col_or_s, label in zip(by, column_labels):
if label in tmp_column_labels:
assert isinstance(col_or_s, Series)
psser = col_or_s
new_by_series.append(psdf._psser_for(label).rename(psser.name))
else:
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
is_object_dtype,
)
from pandas.core.accessor import CachedAccessor # type: ignore[attr-defined]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-untyped]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-not-found]
from pandas.api.types import CategoricalDtype, is_hashable
from pandas._libs import lib

Expand Down Expand Up @@ -547,7 +547,7 @@ def to_numpy(self, dtype: Optional[Union[str, Dtype]] = None, copy: bool = False
"It should only be used if the resulting NumPy ndarray is expected to be small."
)
result = np.asarray(
self._to_internal_pandas()._values, dtype=dtype # type: ignore[attr-defined]
self._to_internal_pandas()._values, dtype=dtype # type: ignore[attr-defined, arg-type]
)
if copy:
result = result.copy()
Expand Down
1 change: 1 addition & 0 deletions python/pyspark/pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1177,6 +1177,7 @@ def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIn
spark_frame_other = cast(MultiIndex, other).to_frame()._to_spark()
keep_name = True

assert isinstance(other, MultiIndex)
index_fields = self._index_fields_for_union_like(other, func_name="intersection")

default_name: List[Name] = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/pandas/internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1567,7 +1567,7 @@ def prepare_pandas_frame(
if retain_index:
index_nlevels = pdf.index.nlevels
index_columns = [SPARK_INDEX_NAME_FORMAT(i) for i in range(index_nlevels)]
pdf.index.names = index_columns # type: ignore[assignment]
pdf.index.names = index_columns
reset_index = pdf.reset_index()
else:
index_nlevels = 0
Expand Down
9 changes: 4 additions & 5 deletions python/pyspark/pandas/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1257,6 +1257,7 @@ def output_func(pdf: pd.DataFrame) -> pd.DataFrame:
return DataFrame(psdf._internal.with_new_sdf(sdf, data_fields=return_data_fields))

if isinstance(sampled, dict):
assert sampled is not None
return {sn: read_excel_on_spark(pdf_or_pser, sn) for sn, pdf_or_pser in sampled.items()}
else:
return read_excel_on_spark(cast(Union[pd.DataFrame, pd.Series], sampled), sheet_name)
Expand Down Expand Up @@ -2325,10 +2326,10 @@ def get_dummies(
values = values[1:]

def column_name(v: Any) -> Name:
if prefix is None or prefix[i] == "": # type: ignore[index]
if prefix is None or prefix[i] == "":
return v
else:
return "{}{}{}".format(prefix[i], prefix_sep, v) # type: ignore[index]
return "{}{}{}".format(prefix[i], prefix_sep, v)

for value in values:
remaining_columns.append(
Expand Down Expand Up @@ -2597,9 +2598,7 @@ def resolve_func(psdf, this_column_labels, that_column_labels):
concat_psdf = concat_psdf[column_labels]

if ignore_index:
concat_psdf.columns = list( # type: ignore[assignment]
map(str, _range(len(concat_psdf.columns)))
)
concat_psdf.columns = list(map(str, _range(len(concat_psdf.columns))))

if sort:
concat_psdf = concat_psdf.sort_index()
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/pandas/plot/matplotlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
from matplotlib.axes._base import _process_plot_format # type: ignore[attr-defined]
from matplotlib.figure import Figure
from pandas.core.dtypes.inference import is_list_like
from pandas.io.formats.printing import pprint_thing # type: ignore[import-untyped]
from pandas.plotting._matplotlib import ( # type: ignore[import-untyped]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-not-found]
from pandas.plotting._matplotlib import ( # type: ignore[import-not-found]
BarPlot as PandasBarPlot,
BoxPlot as PandasBoxPlot,
HistPlot as PandasHistPlot,
Expand All @@ -37,7 +37,7 @@
KdePlot as PandasKdePlot,
)
from pandas.plotting._core import PlotAccessor
from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot # type: ignore[import-untyped]
from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot # type: ignore[import-not-found]

from pyspark.pandas.plot import (
TopNPlotBase,
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
import numpy as np
import pandas as pd
from pandas.core.accessor import CachedAccessor # type: ignore[attr-defined]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-untyped]
from pandas.io.formats.printing import pprint_thing # type: ignore[import-not-found]
from pandas.api.extensions import no_default
from pandas.api.types import (
is_list_like,
Expand Down Expand Up @@ -1082,9 +1082,9 @@ def cov(self, other: "Series", min_periods: Optional[int] = None, ddof: int = 1)
"""
if not isinstance(other, Series):
raise TypeError("unsupported type: %s" % type(other))
if not np.issubdtype(self.dtype, np.number):
if not np.issubdtype(self.dtype, np.number): # type: ignore[arg-type]
raise TypeError("unsupported dtype: %s" % self.dtype)
if not np.issubdtype(other.dtype, np.number):
if not np.issubdtype(other.dtype, np.number): # type: ignore[arg-type]
raise TypeError("unsupported dtype: %s" % other.dtype)
if not isinstance(ddof, int):
raise TypeError("ddof must be integer")
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/pandas/typedef/typehints.py
Original file line number Diff line number Diff line change
Expand Up @@ -865,7 +865,7 @@ def _new_type_holders(
new_param.tpe = param.stop # type: ignore[assignment]
else:
# When the given argument is a numpy's dtype instance.
new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop
new_param.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop # type: ignore[assignment]
new_params.append(new_param)
return tuple(new_params)
elif is_unnamed_params:
Expand All @@ -878,7 +878,7 @@ def _new_type_holders(
if isinstance(param, ExtensionDtype):
new_type.tpe = param # type: ignore[assignment]
else:
new_type.tpe = param.type if isinstance(param, np.dtype) else param
new_type.tpe = param.type if isinstance(param, np.dtype) else param # type: ignore[assignment]
new_types.append(new_type)
return tuple(new_types)
else:
Expand Down
Loading