Skip to content

Commit eee2e81

Browse files
committed
[SPARK-XXXXX][PYTHON] Remove unnecessary .keys() in dict iterations
Iterating over `dict.keys()` is redundant; iterating the dict directly yields the same keys. Similarly, `len(dict.keys())` can be simplified to `len(dict)`. Co-authored-by: Isaac
1 parent 83a0267 commit eee2e81

File tree

7 files changed

+12
-12
lines changed

7 files changed

+12
-12
lines changed

python/pyspark/memory_profiler_ext.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def items(self) -> Iterator[Tuple[str, Iterator[Tuple[int, Any]]]]:
123123
measures = self[code]
124124
if not measures:
125125
continue # skip if no measurement
126-
line_iterator = ((line, measures[line]) for line in measures.keys())
126+
line_iterator = ((line, measures[line]) for line in measures)
127127
yield (filename, line_iterator)
128128

129129
class UDFLineProfiler(LineProfiler):

python/pyspark/pandas/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ def __setattr__(self, key: str, val: Any) -> None:
509509
canonical_key = prefix + key
510510

511511
candidates = [
512-
k for k in d.keys() if all(x in k.split(".") for x in canonical_key.split("."))
512+
k for k in d if all(x in k.split(".") for x in canonical_key.split("."))
513513
]
514514
if len(candidates) == 1 and candidates[0] == canonical_key:
515515
set_option(canonical_key, val)
@@ -528,7 +528,7 @@ def __getattr__(self, key: str) -> Union["DictWrapper", Any]:
528528
canonical_key = prefix + key
529529

530530
candidates = [
531-
k for k in d.keys() if all(x in k.split(".") for x in canonical_key.split("."))
531+
k for k in d if all(x in k.split(".") for x in canonical_key.split("."))
532532
]
533533
if len(candidates) == 1 and candidates[0] == canonical_key:
534534
return get_option(canonical_key)
@@ -549,7 +549,7 @@ def __dir__(self) -> List[str]:
549549
candidates = d.keys()
550550
offset = 0
551551
else:
552-
candidates = [k for k in d.keys() if all(x in k.split(".") for x in prefix.split("."))]
552+
candidates = [k for k in d if all(x in k.split(".") for x in prefix.split("."))]
553553
offset = len(prefix) + 1 # prefix (e.g. "compute.") to trim.
554554
return [c[offset:] for c in candidates]
555555

python/pyspark/pandas/frame.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7072,7 +7072,7 @@ def pivot_table(
70727072
index_map[colname] = None
70737073
internal = InternalFrame(
70747074
spark_frame=sdf,
7075-
index_spark_columns=[scol_for(sdf, col) for col in index_map.keys()],
7075+
index_spark_columns=[scol_for(sdf, col) for col in index_map],
70767076
index_names=list(index_map.values()),
70777077
column_label_names=[columns],
70787078
)
@@ -9778,7 +9778,7 @@ def astype(self, dtype: Union[str, Dtype, Dict[Name, Union[str, Dtype]]]) -> "Da
97789778
applied = []
97799779
if is_dict_like(dtype):
97809780
dtype_dict = cast(Dict[Name, Union[str, Dtype]], dtype)
9781-
for col_name in dtype_dict.keys():
9781+
for col_name in dtype_dict:
97829782
if col_name not in self.columns:
97839783
raise KeyError(
97849784
"Only a column name can be used for the key in a dtype mappings argument."

python/pyspark/pandas/namespace.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1784,7 +1784,7 @@ def pandas_to_datetime(
17841784
if isinstance(arg, Series):
17851785
return arg.pandas_on_spark.transform_batch(pandas_to_datetime)
17861786
if isinstance(arg, DataFrame):
1787-
unit = {k: _unit_map[k.lower()] for k in arg.keys() if k.lower() in _unit_map}
1787+
unit = {k: _unit_map[k.lower()] for k in arg if k.lower() in _unit_map}
17881788
unit_rev = {v: k for k, v in unit.items()}
17891789
list_cols = [unit_rev["year"], unit_rev["month"], unit_rev["day"]]
17901790
for u in ["h", "m", "s", "ms", "us"]:

python/pyspark/pipelines/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def unpack_pipeline_spec(spec_data: Mapping[str, Any]) -> PipelineSpec:
165165
"libraries",
166166
}
167167
REQUIRED_FIELDS = ["name", "storage"]
168-
for key in spec_data.keys():
168+
for key in spec_data:
169169
if key not in ALLOWED_FIELDS:
170170
raise PySparkException(
171171
errorClass="PIPELINE_SPEC_UNEXPECTED_FIELD", messageParameters={"field_name": key}

python/pyspark/sql/types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2595,14 +2595,14 @@ def _int_size_to_type(
25952595
}
25962596

25972597
# compute array typecode mappings for signed integer types
2598-
for _typecode in _array_signed_int_typecode_ctype_mappings.keys():
2598+
for _typecode in _array_signed_int_typecode_ctype_mappings:
25992599
size = ctypes.sizeof(_array_signed_int_typecode_ctype_mappings[_typecode]) * 8
26002600
dt = _int_size_to_type(size)
26012601
if dt is not None:
26022602
_array_type_mappings[_typecode] = dt
26032603

26042604
# compute array typecode mappings for unsigned integer types
2605-
for _typecode in _array_unsigned_int_typecode_ctype_mappings.keys():
2605+
for _typecode in _array_unsigned_int_typecode_ctype_mappings:
26062606
# JVM does not have unsigned types, so use signed types that is at least 1
26072607
# bit larger to store
26082608
size = ctypes.sizeof(_array_unsigned_int_typecode_ctype_mappings[_typecode]) * 8 + 1

python/pyspark/testing/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,9 +1063,9 @@ def compare_vals(val1, val2):
10631063
return all(compare_vals(x, y) for x, y in zip(val1, val2))
10641064
elif isinstance(val1, dict) and isinstance(val2, dict):
10651065
return (
1066-
len(val1.keys()) == len(val2.keys())
1066+
len(val1) == len(val2)
10671067
and val1.keys() == val2.keys()
1068-
and all(compare_vals(val1[k], val2[k]) for k in val1.keys())
1068+
and all(compare_vals(val1[k], val2[k]) for k in val1)
10691069
)
10701070
elif isinstance(val1, float) and isinstance(val2, float):
10711071
if abs(val1 - val2) > (atol + rtol * abs(val2)):

0 commit comments

Comments (0)