Skip to content

Commit 61a024c

Browse files
committed
[SPARK-53916][PYTHON] Deduplicate the variables in PythonArrowInput
### What changes were proposed in this pull request?

Deduplicate the variables in PythonArrowInput

### Why are the changes needed?

Code clean-up.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #52621 from zhengruifeng/unify_var_name.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
1 parent 032dcf8 commit 61a024c

File tree

1 file changed

+8
-16
lines changed

1 file changed

+8
-16
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowInput.scala

Lines changed: 8 additions & 16 deletions

Original file line number | Diff line number | Diff line change
@@ -115,6 +115,13 @@ private[python] trait BasicPythonArrowInput extends PythonArrowInput[Iterator[In
115115
self: BasePythonRunner[Iterator[InternalRow], _] =>
116116
protected val arrowWriter: arrow.ArrowWriter = ArrowWriter.create(root)
117117

118+
protected val maxRecordsPerBatch: Int = {
119+
val v = SQLConf.get.arrowMaxRecordsPerBatch
120+
if (v > 0) v else Int.MaxValue
121+
}
122+
123+
protected val maxBytesPerBatch: Long = SQLConf.get.arrowMaxBytesPerBatch
124+
118125
protected def writeNextBatchToArrowStream(
119126
root: VectorSchemaRoot,
120127
writer: ArrowStreamWriter,
@@ -145,13 +152,6 @@ private[python] trait BasicPythonArrowInput extends PythonArrowInput[Iterator[In
145152

146153
private[python] trait BatchedPythonArrowInput extends BasicPythonArrowInput {
147154
self: BasePythonRunner[Iterator[InternalRow], _] =>
148-
private val arrowMaxRecordsPerBatch = {
149-
val v = SQLConf.get.arrowMaxRecordsPerBatch
150-
if (v > 0) v else Int.MaxValue
151-
}
152-
153-
private val maxBytesPerBatch = SQLConf.get.arrowMaxBytesPerBatch
154-
155155
// Marker inside the input iterator to indicate the start of the next batch.
156156
private var nextBatchStart: Iterator[InternalRow] = Iterator.empty
157157

@@ -169,7 +169,7 @@ private[python] trait BatchedPythonArrowInput extends BasicPythonArrowInput {
169169
val startData = dataOut.size()
170170

171171
val numRowsInBatch = BatchedPythonArrowInput.writeSizedBatch(
172-
arrowWriter, writer, nextBatchStart, maxBytesPerBatch, arrowMaxRecordsPerBatch)
172+
arrowWriter, writer, nextBatchStart, maxBytesPerBatch, maxRecordsPerBatch)
173173

174174
val deltaData = dataOut.size() - startData
175175
pythonMetrics("pythonDataSent") += deltaData
@@ -234,14 +234,6 @@ private[python] object BatchedPythonArrowInput {
234234
* Enables an optimization that splits each group into the sized batches.
235235
*/
236236
private[python] trait GroupedPythonArrowInput { self: RowInputArrowPythonRunner =>
237-
238-
val maxRecordsPerBatch: Int = {
239-
val v = SQLConf.get.arrowMaxRecordsPerBatch
240-
if (v > 0) v else Int.MaxValue
241-
}
242-
243-
val maxBytesPerBatch: Long = SQLConf.get.arrowMaxBytesPerBatch
244-
245237
protected override def newWriter(
246238
env: SparkEnv,
247239
worker: PythonWorker,

0 commit comments

Comments (0)