Commit cbcc7f8
[SPARK-55304][SS][PYTHON] Introduce support of Admission Control and Trigger.AvailableNow in Python data source - streaming reader
### What changes were proposed in this pull request?

This PR proposes to introduce support for Admission Control and Trigger.AvailableNow in the Python data source streaming reader.

To support Admission Control, we propose to change the `DataSourceStreamReader` interface as follows (side-by-side comparison):

| **Before** | **After** |
| :--- | :--- |
| `class DataSourceStreamReader(ABC):` | `class DataSourceStreamReader(ABC):` |
| `def initialOffset(self) -> dict` | `def initialOffset(self) -> dict` |
| `def latestOffset(self) -> dict` | `def latestOffset(self, start: dict, limit: ReadLimit) -> dict` |
| | `def getDefaultReadLimit(self) -> ReadLimit` (optional to implement; default = `ReadAllAvailable()`) |
| | `def reportLatestOffset(self) -> Optional[dict]` (optional to implement; default = `None`) |
| `def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]` | `def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]` |
| `@abstractmethod def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator["RecordBatch"]]` | `@abstractmethod def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator["RecordBatch"]]` |
| `def commit(self, end: dict) -> None` | `def commit(self, end: dict) -> None` |
| `def stop(self) -> None` | `def stop(self) -> None` |

The main changes are:

* The method signature for `latestOffset` has changed. The method remains mandatory.
* The method `getDefaultReadLimit` is added as optional.
* The method `reportLatestOffset` is added as optional.

This way, new implementations support Admission Control by default. The engine handles the old method signature via Python's built-in `inspect` module (similar to Java's reflection): if `latestOffset` is implemented without parameters, the engine falls back to treating the source as one that does not enable admission control. For all new sources, implementing `latestOffset` with parameters is strongly recommended.

The `ReadLimit` interface and its built-in implementations are available for source implementations to leverage: `ReadAllAvailable`, `ReadMinRows`, `ReadMaxRows`, `ReadMaxFiles`, `ReadMaxBytes`. We won't support custom implementations of the `ReadLimit` interface at this point, since that would require major effort and we don't see demand, but we can plan for it if strong demand arises.

We do not make any change to `SimpleDataSourceStreamReader` for Admission Control, since it is designed for small data fetches and can be considered as already limiting the data. We could still add `ReadLimit` support later if we see strong demand for limiting the fetch size via source options.

To support `Trigger.AvailableNow`, we propose to introduce a new interface:

```python
class SupportsTriggerAvailableNow(ABC):
    @abstractmethod
    def prepareForTriggerAvailableNow(self) -> None: ...
```

This interface can be mixed in with both `DataSourceStreamReader` and `SimpleDataSourceStreamReader`. It won't work with `DataSourceStreamReader` implementations that keep the old `latestOffset()` method signature, as mentioned above.

### Why are the changes needed?

This catches up with features already supported in the Scala DSv2 API; developers have reported that the missing features block them from implementing some data sources.

### Does this PR introduce _any_ user-facing change?

Yes. Users implementing a streaming reader via the Python data source API will be able to add support for Admission Control and Trigger.AvailableNow, which had been major missing features.

### How was this patch tested?

New UTs.

### Was this patch authored or co-authored using generative AI tooling?

Co-authored using claude-4.5-sonnet

Closes #54085 from HeartSaVioR/SPARK-55304.
Lead-authored-by: Jungtaek Lim <[email protected]> Co-authored-by: Jitesh Soni <[email protected]> Signed-off-by: Jungtaek Lim <[email protected]>
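The backward-compatibility fallback described in the PR text — detecting whether `latestOffset` was implemented with the old zero-argument signature via the `inspect` module — can be sketched as follows. This is a minimal illustration, not Spark's actual engine code; the two reader classes are hypothetical stand-ins.

```python
import inspect


class OldReader:
    def latestOffset(self):  # legacy signature: no admission control
        return {"index": 10}


class NewReader:
    def latestOffset(self, start, limit):  # new signature: admission control
        return {"index": start["index"] + 5}


def supports_admission_control(reader) -> bool:
    # On a bound method, `self` is already excluded from the signature,
    # so the new signature declares exactly (start, limit).
    params = inspect.signature(reader.latestOffset).parameters
    return len(params) >= 2


print(supports_admission_control(OldReader()))  # False
print(supports_admission_control(NewReader()))  # True
```

If the check fails, the engine can keep calling the zero-argument form, which is how existing third-party sources continue to work unmodified.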
1 parent b75a329 commit cbcc7f8

File tree

10 files changed

+1276
-102
lines changed


python/pyspark/sql/datasource.py

Lines changed: 75 additions & 4 deletions

```diff
@@ -32,6 +32,7 @@
 )

 from pyspark.sql import Row
+from pyspark.sql.streaming.datasource import ReadAllAvailable, ReadLimit
 from pyspark.sql.types import StructType
 from pyspark.errors import PySparkNotImplementedError

@@ -714,9 +715,35 @@ def initialOffset(self) -> dict:
             messageParameters={"feature": "initialOffset"},
         )

-    def latestOffset(self) -> dict:
+    def latestOffset(self, start: dict, limit: ReadLimit) -> dict:
         """
-        Returns the most recent offset available.
+        Returns the most recent offset available given a read limit. The start offset can be used
+        to figure out how much new data should be read given the limit.
+
+        The `start` will be provided from the return value of :meth:`initialOffset()` for
+        the very first micro-batch, and for subsequent micro-batches, the start offset is the
+        ending offset from the previous micro-batch. The source can return the `start` parameter
+        as it is, if there is no data to process.
+
+        :class:`ReadLimit` can be used by the source to limit the amount of data returned in this
+        call. The implementation should implement :meth:`getDefaultReadLimit()` to provide the
+        proper :class:`ReadLimit` if the source can limit the amount of data returned based on the
+        source options.
+
+        The engine can still call :meth:`latestOffset()` with :class:`ReadAllAvailable` even if the
+        source produces a different read limit from :meth:`getDefaultReadLimit()`, to respect the
+        semantic of the trigger. The source must always respect the read limit provided by the
+        engine; e.g. if the read limit is :class:`ReadAllAvailable`, the source must ignore the
+        read limit configured through options.
+
+        .. versionadded:: 4.2.0
+
+        Parameters
+        ----------
+        start : dict
+            The start offset of the microbatch to continue reading from.
+        limit : :class:`ReadLimit`
+            The limit on the amount of data to be returned by this call.

         Returns
         -------
@@ -726,14 +753,58 @@ def latestOffset(self) -> dict:

         Examples
         --------
-        >>> def latestOffset(self):
-        ...     return {"parititon-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}
+        >>> from pyspark.sql.streaming.datasource import ReadAllAvailable, ReadMaxRows
+        >>> def latestOffset(self, start, limit):
+        ...     # Assume the source has 10 new records between start and latest offset
+        ...     if isinstance(limit, ReadAllAvailable):
+        ...         return {"index": start["index"] + 10}
+        ...     else:  # e.g., limit is ReadMaxRows(5)
+        ...         return {"index": start["index"] + min(10, limit.max_rows)}
         """
+        # NOTE: Previous Spark versions didn't have start offset and read limit parameters for
+        # this method. While Spark will ensure the backward compatibility for existing data
+        # sources, new data sources are strongly encouraged to implement this new method
+        # signature.
         raise PySparkNotImplementedError(
             errorClass="NOT_IMPLEMENTED",
             messageParameters={"feature": "latestOffset"},
         )

+    def getDefaultReadLimit(self) -> ReadLimit:
+        """
+        Returns the read limits potentially passed to the data source through options when
+        creating the data source. See the built-in implementations of :class:`ReadLimit` for
+        available read limits.
+
+        Implementing this method is optional. By default, it returns :class:`ReadAllAvailable`,
+        which means there is no limit on the amount of data returned by :meth:`latestOffset()`.
+
+        .. versionadded:: 4.2.0
+        """
+        return ReadAllAvailable()
+
+    def reportLatestOffset(self) -> Optional[dict]:
+        """
+        Returns the most recent offset available. The information is used to report the latest
+        offset in the streaming query status.
+        The source can return `None` if there is no data to process or the source does not
+        support this method.
+
+        .. versionadded:: 4.2.0
+
+        Returns
+        -------
+        dict or None
+            A dict or recursive dict whose key and value are primitive types, which includes
+            Integer, String and Boolean.
+            Returns `None` if the source does not support reporting latest offset.
+
+        Examples
+        --------
+        >>> def reportLatestOffset(self):
+        ...     return {"partition-1": {"index": 100}, "partition-2": {"index": 200}}
+        """
+        return None
+
     def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]:
         """
         Returns a list of InputPartition given the start and end offsets. Each InputPartition
```
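The contract documented in the new `latestOffset(start, limit)` docstring — return `start` unchanged when there is no data, and cap the advance when the limit is a max-rows limit — can be sketched with a self-contained toy reader. The `ReadLimit` classes here are local stand-ins mirroring the shapes in this commit, and `CounterStreamReader` is a hypothetical source, not part of the PR.

```python
from dataclasses import dataclass


# Local stand-ins for pyspark.sql.streaming.datasource (shapes taken from the diff).
class ReadLimit: ...


@dataclass
class ReadAllAvailable(ReadLimit): ...


@dataclass
class ReadMaxRows(ReadLimit):
    max_rows: int


class CounterStreamReader:
    """Hypothetical reader over a monotonically growing counter source."""

    def __init__(self, available: int):
        self.available = available  # highest index currently available

    def initialOffset(self) -> dict:
        return {"index": 0}

    def latestOffset(self, start: dict, limit: ReadLimit) -> dict:
        pending = self.available - start["index"]
        if pending <= 0:
            return start  # no new data: return `start` as-is, per the contract
        if isinstance(limit, ReadMaxRows):
            pending = min(pending, limit.max_rows)  # honor the engine's limit
        return {"index": start["index"] + pending}


reader = CounterStreamReader(available=10)
print(reader.latestOffset({"index": 0}, ReadAllAvailable()))  # {'index': 10}
print(reader.latestOffset({"index": 0}, ReadMaxRows(3)))      # {'index': 3}
```

Note that when the engine passes `ReadAllAvailable` (e.g. for a trigger that must drain the source), the reader ignores any limit configured through its own options.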

python/pyspark/sql/datasource_internal.py

Lines changed: 56 additions & 28 deletions

```diff
@@ -19,16 +19,25 @@
 import json
 import copy
 from itertools import chain
-from typing import Iterator, List, Optional, Sequence, Tuple
+from typing import Iterator, List, Sequence, Tuple, Type, Dict

 from pyspark.sql.datasource import (
     DataSource,
     DataSourceStreamReader,
     InputPartition,
     SimpleDataSourceStreamReader,
 )
+from pyspark.sql.streaming.datasource import (
+    ReadAllAvailable,
+    ReadLimit,
+    ReadMaxBytes,
+    ReadMaxRows,
+    ReadMinRows,
+    ReadMaxFiles,
+)
 from pyspark.sql.types import StructType
 from pyspark.errors import PySparkNotImplementedError
+from pyspark.errors.exceptions.base import PySparkException


 def _streamReader(datasource: DataSource, schema: StructType) -> "DataSourceStreamReader":
@@ -62,13 +71,9 @@ class _SimpleStreamReaderWrapper(DataSourceStreamReader):
     so that :class:`SimpleDataSourceStreamReader` can integrate with streaming engine like an
     ordinary :class:`DataSourceStreamReader`.

-    current_offset tracks the latest progress of the record prefetching, it is initialized to be
-    initialOffset() when query start for the first time or initialized to be the end offset of
-    the last planned batch when query restarts.
-
     When streaming engine calls latestOffset(), the wrapper calls read() that starts from
-    current_offset, prefetches and cache the data, then updates the current_offset to be
-    the end offset of the new data.
+    the start provided via the parameter of latestOffset(), prefetches and caches the data,
+    then returns the end offset of the new data.

     When streaming engine call planInputPartitions(start, end), the wrapper get the prefetched data
     from cache and send it to JVM along with the input partitions.
@@ -79,28 +84,26 @@ class _SimpleStreamReaderWrapper(DataSourceStreamReader):

     def __init__(self, simple_reader: SimpleDataSourceStreamReader):
         self.simple_reader = simple_reader
-        self.initial_offset: Optional[dict] = None
-        self.current_offset: Optional[dict] = None
         self.cache: List[PrefetchedCacheEntry] = []

     def initialOffset(self) -> dict:
-        if self.initial_offset is None:
-            self.initial_offset = self.simple_reader.initialOffset()
-        return self.initial_offset
-
-    def latestOffset(self) -> dict:
-        # when query start for the first time, use initial offset as the start offset.
-        if self.current_offset is None:
-            self.current_offset = self.initialOffset()
-        (iter, end) = self.simple_reader.read(self.current_offset)
-        self.cache.append(PrefetchedCacheEntry(self.current_offset, end, iter))
-        self.current_offset = end
+        return self.simple_reader.initialOffset()
+
+    def getDefaultReadLimit(self) -> ReadLimit:
+        # We do not consider providing different read limit on simple stream reader.
+        return ReadAllAvailable()
+
+    def latestOffset(self, start: dict, limit: ReadLimit) -> dict:
+        assert start is not None, "start offset should not be None"
+        assert isinstance(
+            limit, ReadAllAvailable
+        ), "simple stream reader does not support read limit"
+
+        (iter, end) = self.simple_reader.read(start)
+        self.cache.append(PrefetchedCacheEntry(start, end, iter))
         return end

     def commit(self, end: dict) -> None:
-        if self.current_offset is None:
-            self.current_offset = end
-
         end_idx = -1
         for idx, entry in enumerate(self.cache):
             if json.dumps(entry.end) == json.dumps(end):
@@ -112,11 +115,6 @@ def commit(self, end: dict) -> None:
         self.simple_reader.commit(end)

     def partitions(self, start: dict, end: dict) -> Sequence["InputPartition"]:
-        # when query restart from checkpoint, use the last committed offset as the start offset.
-        # This depends on the streaming engine calling planInputPartitions() of the last batch
-        # in offset log when query restart.
-        if self.current_offset is None:
-            self.current_offset = end
         if len(self.cache) > 0:
             assert self.cache[-1].end == end
         return [SimpleInputPartition(start, end)]
@@ -144,3 +142,33 @@ def read(
         self, input_partition: SimpleInputPartition  # type: ignore[override]
     ) -> Iterator[Tuple]:
         return self.simple_reader.readBetweenOffsets(input_partition.start, input_partition.end)
+
+
+class ReadLimitRegistry:
+    def __init__(self) -> None:
+        self._registry: Dict[str, Type[ReadLimit]] = {}
+        # Register built-in ReadLimit types
+        self.__register(ReadAllAvailable)
+        self.__register(ReadMinRows)
+        self.__register(ReadMaxRows)
+        self.__register(ReadMaxFiles)
+        self.__register(ReadMaxBytes)
+
+    def __register(self, read_limit_type: Type["ReadLimit"]) -> None:
+        name = read_limit_type.__name__
+        if name in self._registry:
+            raise PySparkException(f"ReadLimit type '{name}' is already registered.")
+        self._registry[name] = read_limit_type
+
+    def get(self, params_with_type: dict) -> ReadLimit:
+        type_name = params_with_type["_type"]
+        if type_name is None:
+            raise PySparkException("ReadLimit type name is missing.")
+
+        read_limit_type = self._registry.get(type_name)
+        if read_limit_type is None:
+            raise PySparkException("name '{}' is not registered.".format(type_name))
+
+        params_without_type = params_with_type.copy()
+        del params_without_type["_type"]
+        return read_limit_type(**params_without_type)
```
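`ReadLimitRegistry` rebuilds a `ReadLimit` dataclass from a dict carrying a `_type` discriminator plus the dataclass fields as keyword arguments. The round trip can be shown with a self-contained sketch: the dataclasses and the registry logic below are local re-declarations mirroring this commit, and `to_wire` is a hypothetical helper illustrating the assumed dict shape (the actual serialization lives elsewhere in the engine).

```python
from dataclasses import dataclass, asdict
from typing import Dict, Type


class ReadLimit: ...


@dataclass
class ReadAllAvailable(ReadLimit): ...


@dataclass
class ReadMaxRows(ReadLimit):
    max_rows: int


class ReadLimitRegistry:
    def __init__(self) -> None:
        self._registry: Dict[str, Type[ReadLimit]] = {}
        for t in (ReadAllAvailable, ReadMaxRows):
            self._registry[t.__name__] = t  # keyed by class name, as in the diff

    def get(self, params_with_type: dict) -> ReadLimit:
        params = params_with_type.copy()
        type_name = params.pop("_type")  # remaining keys become constructor kwargs
        return self._registry[type_name](**params)


def to_wire(limit: ReadLimit) -> dict:
    # Hypothetical inverse: tag the dataclass fields with the "_type" discriminator.
    return {"_type": type(limit).__name__, **asdict(limit)}


registry = ReadLimitRegistry()
wire = to_wire(ReadMaxRows(max_rows=100))
print(wire)                # {'_type': 'ReadMaxRows', 'max_rows': 100}
print(registry.get(wire))  # ReadMaxRows(max_rows=100)
```

Keying the registry by class name is what makes custom `ReadLimit` subclasses unsupported for now: only the five built-in names are ever registered.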
python/pyspark/sql/streaming/datasource.py (new file)

Lines changed: 119 additions & 0 deletions

```python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from abc import ABC, abstractmethod
from dataclasses import dataclass


class ReadLimit:
    """
    Specifies limits on how much data to read from a streaming source when
    determining the latest offset.

    As of Spark 4.2.0, only built-in implementations of :class:`ReadLimit` are supported. Please
    refer to the following classes for the supported types:

    - :class:`ReadAllAvailable`
    - :class:`ReadMinRows`
    - :class:`ReadMaxRows`
    - :class:`ReadMaxFiles`
    - :class:`ReadMaxBytes`
    """


@dataclass
class ReadAllAvailable(ReadLimit):
    """
    A :class:`ReadLimit` that indicates to read all available data, regardless of the given
    source options.
    """


@dataclass
class ReadMinRows(ReadLimit):
    """
    A :class:`ReadLimit` that indicates to read a minimum of N rows. If fewer than N rows are
    available to read, the source should skip producing a new offset and wait until more data
    arrives.

    Note that this semantic does not work properly with Trigger.AvailableNow, since the source
    may end up waiting forever for more data to arrive. It is the source's responsibility to
    handle this case properly.
    """

    min_rows: int


@dataclass
class ReadMaxRows(ReadLimit):
    """
    A :class:`ReadLimit` that indicates to read a maximum of N rows. The source should not read
    more than N rows when determining the latest offset.
    """

    max_rows: int


@dataclass
class ReadMaxFiles(ReadLimit):
    """
    A :class:`ReadLimit` that indicates to read a maximum of N files. The source should not read
    more than N files when determining the latest offset.
    """

    max_files: int


@dataclass
class ReadMaxBytes(ReadLimit):
    """
    A :class:`ReadLimit` that indicates to read a maximum of N bytes. The source should not read
    more than N bytes when determining the latest offset.
    """

    max_bytes: int


class SupportsTriggerAvailableNow(ABC):
    """
    A mixin interface for streaming sources that support Trigger.AvailableNow. This interface
    can be added to both :class:`DataSourceStreamReader` and
    :class:`SimpleDataSourceStreamReader`.
    """

    @abstractmethod
    def prepareForTriggerAvailableNow(self) -> None:
        """
        This will be called at the beginning of streaming queries with Trigger.AvailableNow, to
        let the source record the offset for the latest data available at that time (a.k.a. the
        target offset for the query). The source must behave as if no new data arrives after the
        target offset, i.e., the source must not return an offset higher than the target offset
        when :meth:`DataSourceStreamReader.latestOffset()` is called.

        The source can extend the semantic of "current latest data" based on its own logic, but
        the extended semantic must not violate the expectation that the source will not read any
        data added after the time this method was called.

        Note that it is the source's responsibility to ensure that calls to
        :meth:`DataSourceStreamReader.latestOffset()` or
        :meth:`SimpleDataSourceStreamReader.read()` after this method eventually reach the
        target offset and finally return the same offset as the given start parameter, to
        indicate that there is no more data to read. This includes the case where the query is
        restarted and the source is asked to read from the offset journaled in the previous run;
        the source should take care of exceptional cases, like a new partition added during the
        restart, to ensure that the query run will complete at some point.
        """
        pass
```
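The `prepareForTriggerAvailableNow` contract — freeze a target offset and never report past it, even if new data arrives — can be sketched with a hypothetical reader. The mixin is re-declared locally so the snippet runs standalone; the counter source and its fields are illustrative assumptions, not part of this commit.

```python
from abc import ABC, abstractmethod


class SupportsTriggerAvailableNow(ABC):
    @abstractmethod
    def prepareForTriggerAvailableNow(self) -> None: ...


class CappedReader(SupportsTriggerAvailableNow):
    """Hypothetical reader that freezes its target offset for Trigger.AvailableNow."""

    def __init__(self) -> None:
        self.available = 0  # highest index the source has produced so far
        self.target = None  # frozen at prepare time, if any

    def prepareForTriggerAvailableNow(self) -> None:
        self.target = self.available  # record the "current latest data"

    def latestOffset(self, start: dict, limit) -> dict:
        end = self.available
        if self.target is not None:
            end = min(end, self.target)  # never report past the frozen target
        # Returning the start offset unchanged signals "no more data to read",
        # which is what lets an AvailableNow query terminate.
        return start if end <= start["index"] else {"index": end}


r = CappedReader()
r.available = 5
r.prepareForTriggerAvailableNow()
r.available = 9  # data arriving after prepare must be ignored
print(r.latestOffset({"index": 0}, None))  # {'index': 5}
print(r.latestOffset({"index": 5}, None))  # {'index': 5}: equals start, so done
```

The second call returning the start offset unchanged is the termination condition the docstring describes: the query drains up to the target and then stops.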

python/pyspark/sql/streaming/listener.py

Lines changed: 11 additions & 3 deletions

```diff
@@ -914,12 +914,20 @@ def fromJObject(cls, jprogress: "JavaObject") -> "SourceProgress":

     @classmethod
     def fromJson(cls, j: Dict[str, Any]) -> "SourceProgress":
+        def _to_json_string(value: Any) -> str:
+            """Convert offset value to JSON string. If already a string, return as-is.
+            If a dict/list, JSON-encode it."""
+            if isinstance(value, str):
+                return value
+            else:
+                return json.dumps(value)
+
         return cls(
             jdict=j,
             description=j["description"],
-            startOffset=str(j["startOffset"]),
-            endOffset=str(j["endOffset"]),
-            latestOffset=str(j["latestOffset"]),
+            startOffset=_to_json_string(j["startOffset"]),
+            endOffset=_to_json_string(j["endOffset"]),
+            latestOffset=_to_json_string(j["latestOffset"]),
             numInputRows=j["numInputRows"],
             inputRowsPerSecond=j["inputRowsPerSecond"],
             processedRowsPerSecond=j["processedRowsPerSecond"],
```
