Commit 61fb41f

nicholaschew11 authored and HeartSaVioR committed
[SPARK-55147][SS] Scope timestamp range for time-interval join retrieval in V4 state format
### What changes were proposed in this pull request?

This PR improves the retrieval operation in the V4 stream-stream join state manager to scope the timestamp range for time-interval joins. Instead of scanning all timestamps for a given key during prefix scan, V4 now extracts constant interval offsets from the join condition and computes a `(minTs, maxTs)` range per input row, enabling the prefix scan to skip entries before `minTs` and terminate early past `maxTs`.

- Add `scanRangeOffsets` and `computeTimestampRange` to `OneSideHashJoiner`, using `StreamingJoinHelper.getStateValueWatermark(eventWatermark=0)` to extract interval bounds from the join condition
- Add a `timestampRange` parameter to `getJoinedRows` in the state manager trait, the V4 implementation, and the V1-V3 base class (ignored by V1-V3)
- Add `getValuesInRange` to `KeyWithTsToValuesStore` that filters by range and stops early past the upper bound
- `getValues` now delegates to `getValuesInRange(Long.MinValue, Long.MaxValue)`

### Why are the changes needed?

For time-interval joins, the V4 state format stores values indexed by `(key, timestamp)`. Without range scoping, retrieving matches requires scanning all timestamps for a key via prefix scan, even though the join condition constrains matching to a specific time window. With this change, the scan is bounded to only the relevant timestamp range, reducing I/O proportionally to the ratio of the interval width to the total timestamp span in state.

### Does this PR introduce _any_ user-facing change?

No. The V4 state format is experimental and gated behind `spark.sql.streaming.join.stateFormatV4.enabled`.

### How was this patch tested?

New unit tests in `SymmetricHashJoinStateManagerEventTimeInValueSuite`:

- `getJoinedRows with timestampRange`: boundary conditions, exact matches, empty ranges, full range
- `timestampRange with multiple values per timestamp`: multiple values at the same timestamp

Existing V4 join suites (Inner, Outer, FullOuter, LeftSemi) all pass.

### Was this patch authored or co-authored using generative AI tooling?

Yes. (Claude Opus 4.6)

Closes #54879 from nicholaschew11/SPARK-55147-range-scan-v4.

Authored-by: Nicholas Chew <chew.nicky@gmail.com>
Signed-off-by: Jungtaek Lim <kabhwan.opensource@gmail.com>
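As a worked illustration of the per-row `(minTs, maxTs)` computation described above, here is a minimal standalone sketch. The names `RangeSketch` and `ScanRangeOffsets` are illustrative only (not Spark's actual API); offsets are in microseconds, matching the V4 code's ms-to-us conversion.

```scala
// Illustrative sketch: given constant interval offsets extracted from a join
// condition, compute the inclusive (minTs, maxTs) scan range for one input row.
object RangeSketch {
  // Offsets in microseconds relative to this row's event time (hypothetical type).
  final case class ScanRangeOffsets(lowerUs: Long, upperUs: Long)

  // None means the offsets could not be extracted, so the scan stays unbounded.
  def timestampRange(
      eventTimeUs: Long,
      offsets: Option[ScanRangeOffsets]): Option[(Long, Long)] =
    offsets.map(o => (eventTimeUs + o.lowerUs, eventTimeUs + o.upperUs))

  def main(args: Array[String]): Unit = {
    // e.g. a condition like "otherTime BETWEEN thisTime - 2s AND thisTime + 1s"
    // would yield offsets of (-2000000, +1000000) microseconds.
    val offsets = Some(ScanRangeOffsets(-2000000L, 1000000L))
    println(timestampRange(10000000L, offsets)) // Some((8000000,11000000))
    println(timestampRange(10000000L, None))    // None -> fall back to full scan
  }
}
```

When the range cannot be derived (non-constant offsets, non-interval join), the retrieval degrades gracefully to the previous full prefix scan.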
1 parent e7a9976 commit 61fb41f

File tree

3 files changed: +151 −50 lines

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/operators/stateful/join/StreamingSymmetricHashJoinExec.scala

Lines changed: 48 additions & 2 deletions

```diff
@@ -23,7 +23,8 @@ import org.apache.hadoop.conf.Configuration
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, JoinedRow, Literal, Predicate, UnsafeProjection, UnsafeRow}
+import org.apache.spark.sql.catalyst.analysis.StreamingJoinHelper
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, GenericInternalRow, JoinedRow, Literal, Predicate, UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
@@ -682,6 +683,50 @@ case class StreamingSymmetricHashJoinExec(
     private[this] val allowMultipleStatefulOperators: Boolean =
       conf.getConf(SQLConf.STATEFUL_OPERATOR_ALLOW_MULTIPLE)
 
+    // V4 range scan for time-interval joins (SPARK-55147). Extracts constant interval
+    // offsets from the join condition using getStateValueWatermark(eventWatermark=0).
+    // The -1 eviction adjustment widens range by ~1ms/side; postJoinFilter handles exact bounds.
+    private[this] val scanRangeOffsets: Option[(Long, Long)] = {
+      val isV4TimeIntervalJoin = stateFormatVersion >= 4 && (stateWatermarkPredicate match {
+        case Some(_: JoinStateValueWatermarkPredicate) => true
+        case _ => false
+      })
+
+      if (!isV4TimeIntervalJoin) {
+        None
+      } else {
+        val (thisSideAttrs, otherSideAttrs) = joinSide match {
+          case LeftSide => (left.output, right.output)
+          case RightSide => (right.output, left.output)
+        }
+
+        val lowerBoundMs = StreamingJoinHelper.getStateValueWatermark(
+          AttributeSet(otherSideAttrs), AttributeSet(thisSideAttrs), condition.full, Some(0L))
+        val upperBoundMs = StreamingJoinHelper.getStateValueWatermark(
+          AttributeSet(thisSideAttrs), AttributeSet(otherSideAttrs), condition.full, Some(0L))
+
+        (lowerBoundMs, upperBoundMs) match {
+          case (Some(lower), Some(upper)) =>
+            Some((lower * 1000L, -upper * 1000L)) // ms -> us
+          case _ => None
+        }
+      }
+    }
+
+    private[this] val eventTimeIdxForRangeScan: Int = scanRangeOffsets.map { _ =>
+      WatermarkSupport.findEventTimeColumnIndex(
+        inputAttributes, !allowMultipleStatefulOperators).getOrElse(-1)
+    }.getOrElse(-1)
+
+    private def computeTimestampRange(thisRow: UnsafeRow): Option[(Long, Long)] = {
+      scanRangeOffsets match {
+        case Some((lowerOffset, upperOffset)) if eventTimeIdxForRangeScan >= 0 =>
+          val eventTimeUs = thisRow.getLong(eventTimeIdxForRangeScan)
+          Some((eventTimeUs + lowerOffset, eventTimeUs + upperOffset))
+        case _ => None
+      }
+    }
+
     /**
      * Generate joined rows by consuming input from this side, and matching it with the buffered
      * rows (i.e. state) of the other side.
@@ -758,7 +803,8 @@ case class StreamingSymmetricHashJoinExec(
         otherSideJoiner.joinStateManager.getJoinedRows(
           key,
           thatRow => generateJoinedRow(thisRow, thatRow),
-          postJoinFilter)
+          postJoinFilter,
+          timestampRange = computeTimestampRange(thisRow))
       }
       val outputIter = generateOutputIter(thisRow, joinedRowIter)
       new AddingProcessedRowToStateCompletionIterator(key, thisRow, outputIter)
```
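To make the offset-extraction step above concrete: the real code delegates to `StreamingJoinHelper.getStateValueWatermark`, but the essential idea is that both bounds of the interval condition must be constants. The following toy model (names `IntervalBounds`, `GreaterEq`, `LessEq` are invented for illustration, not Spark's expression tree) sketches that requirement under the assumption that the condition has already been split into simple bounds:

```scala
// Toy illustration (not Spark's StreamingJoinHelper): given a time-interval join
// condition of the form  otherTs >= thisTs + lower  AND  otherTs <= thisTs + upper,
// with constant offsets, the scan range per row is just (lower, upper) around its
// event time. If either bound is missing, we cannot scope the scan and return None.
object IntervalBounds {
  sealed trait Bound
  final case class GreaterEq(offsetMs: Long) extends Bound // otherTs >= thisTs + offset
  final case class LessEq(offsetMs: Long) extends Bound    // otherTs <= thisTs + offset

  def extractOffsets(conds: Seq[Bound]): Option[(Long, Long)] = {
    val lower = conds.collectFirst { case GreaterEq(o) => o }
    val upper = conds.collectFirst { case LessEq(o) => o }
    for (l <- lower; u <- upper) yield (l, u)
  }

  def main(args: Array[String]): Unit = {
    // e.g. "otherTs BETWEEN thisTs - 2s AND thisTs + 1s"
    println(extractOffsets(Seq(GreaterEq(-2000L), LessEq(1000L)))) // Some((-2000,1000))
    println(extractOffsets(Seq(GreaterEq(-2000L))))                // None: one-sided only
  }
}
```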

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/operators/stateful/join/SymmetricHashJoinStateManager.scala

Lines changed: 52 additions & 48 deletions

```diff
@@ -67,11 +67,16 @@ trait SymmetricHashJoinStateManager {
    * required to do so.
    *
    * It is caller's responsibility to consume the whole iterator.
+   *
+   * @param timestampRange Optional optimization hint as (minTimestamp, maxTimestamp), both
+   *   inclusive. Derived classes may use it to reduce scan scope but are free to ignore it.
+   *   The predicate must produce correct output regardless of whether this hint is leveraged.
    */
   def getJoinedRows(
       key: UnsafeRow,
       generateJoinedRow: InternalRow => JoinedRow,
-      predicate: JoinedRow => Boolean): Iterator[JoinedRow]
+      predicate: JoinedRow => Boolean,
+      timestampRange: Option[(Long, Long)] = None): Iterator[JoinedRow]
 
   /**
    * Retrieve all joined rows for the given key and remove the matched rows from state. The joined
@@ -343,9 +348,8 @@ class SymmetricHashJoinStateManagerV4(
   override def getJoinedRows(
       key: UnsafeRow,
       generateJoinedRow: InternalRow => JoinedRow,
-      predicate: JoinedRow => Boolean): Iterator[JoinedRow] = {
-    // TODO: [SPARK-55147] We could improve this method to get the scope of timestamp and scan keys
-    // more efficiently. For now, we just get all values for the key.
+      predicate: JoinedRow => Boolean,
+      timestampRange: Option[(Long, Long)] = None): Iterator[JoinedRow] = {
     def getJoinedRowsFromTsAndValues(
         ts: Long,
         valuesAndMatched: Array[ValueAndMatchPair]): Iterator[JoinedRow] = {
@@ -399,7 +403,8 @@ class SymmetricHashJoinStateManagerV4(
         getJoinedRowsFromTsAndValues(ts, valuesAndMatchedIter.toArray)
 
       case _ =>
-        keyWithTsToValues.getValues(key).flatMap { result =>
+        val (minTs, maxTs) = timestampRange.getOrElse((Long.MinValue, Long.MaxValue))
+        keyWithTsToValues.getValuesInRange(key, minTs, maxTs).flatMap { result =>
          val ts = result.timestamp
          val valuesAndMatched = result.values.toArray
          getJoinedRowsFromTsAndValues(ts, valuesAndMatched)
@@ -626,66 +631,64 @@ class SymmetricHashJoinStateManagerV4(
 
    // NOTE: This assumes we consume the whole iterator to trigger completion.
    def getValues(key: UnsafeRow): Iterator[GetValuesResult] = {
+      getValuesInRange(key, Long.MinValue, Long.MaxValue)
+    }
+
+    /**
+     * Returns entries where minTs <= timestamp <= maxTs (both inclusive), grouped by timestamp.
+     * Skips entries before minTs and stops iterating past maxTs (timestamps are sorted).
+     */
+    def getValuesInRange(
+        key: UnsafeRow, minTs: Long, maxTs: Long): Iterator[GetValuesResult] = {
      val reusableGetValuesResult = new GetValuesResult()
 
      new NextIterator[GetValuesResult] {
        private val iter = stateStore.prefixScanWithMultiValues(key, colFamilyName)
 
        private var currentTs = -1L
+        private var pastUpperBound = false
        private val valueAndMatchPairs = scala.collection.mutable.ArrayBuffer[ValueAndMatchPair]()
 
+        private def flushAccumulated(): GetValuesResult = {
+          if (valueAndMatchPairs.nonEmpty) {
+            val result = reusableGetValuesResult.withNew(
+              currentTs, valueAndMatchPairs.toList)
+            currentTs = -1L
+            valueAndMatchPairs.clear()
+            result
+          } else {
+            finished = true
+            null
+          }
+        }
+
        @tailrec
        override protected def getNext(): GetValuesResult = {
-          if (iter.hasNext) {
+          if (pastUpperBound || !iter.hasNext) {
+            flushAccumulated()
+          } else {
            val unsafeRowPair = iter.next()
-
            val ts = TimestampKeyStateEncoder.extractTimestamp(unsafeRowPair.key)
 
-            if (currentTs == -1L) {
-              // First time
+            if (ts > maxTs) {
+              pastUpperBound = true
+              getNext()
+            } else if (ts < minTs) {
+              getNext()
+            } else if (currentTs == -1L || currentTs == ts) {
              currentTs = ts
-            }
-
-            if (currentTs != ts) {
-              assert(valueAndMatchPairs.nonEmpty,
-                "timestamp has changed but no values collected from previous timestamp! " +
-                s"This should not happen. currentTs: $currentTs, new ts: $ts")
-
-              // Return previous batch
-              val result = reusableGetValuesResult.withNew(
-                currentTs, valueAndMatchPairs.toSeq)
+              valueAndMatchPairs += valueRowConverter.convertValue(unsafeRowPair.value)
+              getNext()
+            } else {
+              // Timestamp changed -- flush previous group before starting new one
+              val prevTs = currentTs
+              val prevValues = valueAndMatchPairs.toList
 
-              // Reset for new timestamp
              currentTs = ts
              valueAndMatchPairs.clear()
+              valueAndMatchPairs += valueRowConverter.convertValue(unsafeRowPair.value)
 
-              // Add current value
-              val value = valueRowConverter.convertValue(unsafeRowPair.value)
-              valueAndMatchPairs += value
-              result
-            } else {
-              // Same timestamp, accumulate values
-              val value = valueRowConverter.convertValue(unsafeRowPair.value)
-              valueAndMatchPairs += value
-
-              // Continue to next
-              getNext()
-            }
-          } else {
-            if (currentTs != -1L) {
-              assert(valueAndMatchPairs.nonEmpty)
-
-              // Return last batch
-              val result = reusableGetValuesResult.withNew(
-                currentTs, valueAndMatchPairs.toSeq)
-
-              // Mark as finished
-              currentTs = -1L
-              valueAndMatchPairs.clear()
-              result
-            } else {
-              finished = true
-              null
+              reusableGetValuesResult.withNew(prevTs, prevValues)
            }
          }
        }
@@ -1051,7 +1054,8 @@ abstract class SymmetricHashJoinStateManagerBase(
   def getJoinedRows(
       key: UnsafeRow,
       generateJoinedRow: InternalRow => JoinedRow,
-      predicate: JoinedRow => Boolean): Iterator[JoinedRow] = {
+      predicate: JoinedRow => Boolean,
+      timestampRange: Option[(Long, Long)] = None): Iterator[JoinedRow] = {
    val numValues = keyToNumValues.get(key)
    keyWithIndexToValue.getAll(key, numValues).map { keyIdxToValue =>
      val joinedRow = generateJoinedRow(keyIdxToValue.value)
```
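The scan loop in `getValuesInRange` above can be modeled more simply on plain collections. The sketch below (assumed names; the real code drives a state store prefix scan via `NextIterator`) shows the three behaviors the diff adds: skip entries below `minTs`, stop at the first entry above `maxTs` since timestamps are sorted, and group the surviving values by timestamp:

```scala
// Simplified, self-contained model of getValuesInRange's scan over entries
// sorted by timestamp. Not the production code -- just its control flow.
object RangeScanSketch {
  def valuesInRange(
      sorted: Seq[(Long, String)], // (timestamp, value), sorted by timestamp
      minTs: Long,
      maxTs: Long): Seq[(Long, Seq[String])] = {
    sorted.iterator
      .dropWhile { case (ts, _) => ts < minTs }  // skip entries before the lower bound
      .takeWhile { case (ts, _) => ts <= maxTs } // early termination past the upper bound
      .toSeq
      .groupBy(_._1).toSeq.sortBy(_._1)          // one group per timestamp
      .map { case (ts, vs) => (ts, vs.map(_._2)) }
  }

  def main(args: Array[String]): Unit = {
    val entries = Seq((10L, "a"), (20L, "b"), (20L, "c"), (30L, "d"), (50L, "e"))
    println(valuesInRange(entries, 20L, 30L)) // groups for ts 20 and 30 only
  }
}
```

The production version additionally reuses a single `GetValuesResult` object per group and flushes the final accumulated group when the iterator is exhausted or the upper bound is crossed.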

sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala

Lines changed: 51 additions & 0 deletions

```diff
@@ -1009,4 +1009,55 @@ class SymmetricHashJoinStateManagerEventTimeInValueSuite
       }
     }
   }
+
+  // NOTE: In practice, the predicate should contain the condition matching timestampRange.
+  // Here we intentionally use a pass-all predicate to test timestampRange filtering directly.
+  private def getJoinedRowTimestamps(
+      key: Int,
+      range: Option[(Long, Long)])(implicit manager: SymmetricHashJoinStateManager): Seq[Int] = {
+    val dummyRow = new GenericInternalRow(0)
+    manager.getJoinedRows(
+      toJoinKeyRow(key),
+      row => new JoinedRow(row, dummyRow),
+      _ => true,
+      timestampRange = range
+    ).map(_.getInt(1)).toSeq.sorted
+  }
+
+  test("StreamingJoinStateManager V4 - getJoinedRows with timestampRange") {
+    withJoinStateManager(
+      inputValueAttributes, joinKeyExpressions, stateFormatVersion = 4) { manager =>
+      implicit val mgr = manager
+
+      Seq(10, 20, 30, 40, 50).foreach(append(40, _))
+
+      assert(getJoinedRowTimestamps(40, Some((20L, 40L))) === Seq(20, 30, 40))
+      assert(getJoinedRowTimestamps(40, Some((20L, 20L))) === Seq(20))
+      assert(getJoinedRowTimestamps(40, Some((25L, 35L))) === Seq(30))
+      assert(getJoinedRowTimestamps(40, Some((0L, 100L))) === Seq(10, 20, 30, 40, 50))
+      assert(getJoinedRowTimestamps(40, Some((10L, 30L))) === Seq(10, 20, 30))
+      assert(getJoinedRowTimestamps(40, Some((50L, 100L))) === Seq(50))
+      assert(getJoinedRowTimestamps(40, Some((60L, 100L))) === Seq.empty)
+      assert(getJoinedRowTimestamps(40, Some((0L, 5L))) === Seq.empty)
+      assert(getJoinedRowTimestamps(40, None) === Seq(10, 20, 30, 40, 50))
+    }
+  }
+
+  test("StreamingJoinStateManager V4 - timestampRange with multiple values per timestamp") {
+    withJoinStateManager(
+      inputValueAttributes, joinKeyExpressions, stateFormatVersion = 4) { manager =>
+      implicit val mgr = manager
+
+      append(40, 10)
+      append(40, 10)
+      append(40, 20)
+      append(40, 20)
+      append(40, 20)
+      append(40, 30)
+
+      assert(getJoinedRowTimestamps(40, Some((20L, 20L))) === Seq(20, 20, 20))
+      assert(getJoinedRowTimestamps(40, Some((10L, 20L))) === Seq(10, 10, 20, 20, 20))
+      assert(getJoinedRowTimestamps(40, Some((10L, 30L))) === Seq(10, 10, 20, 20, 20, 30))
+    }
+  }
 }
```
