Skip to content

Commit 2c0fc70

Browse files
committed
[SPARK-51891][SS] Squeeze the protocol of ListState GET / PUT / APPENDLIST for transformWithState in PySpark
### What changes were proposed in this pull request? This PR proposes to squeeze the protocol of ListState GET / PUT / APPENDLIST for transformWithState in PySpark, which will help a lot when dealing with a small list in ListState. Here are the changes: * ListState.get() no longer requires an additional request to notice there is no further data to read. * We inline the data into the proto message, to make it easy to determine whether the iterator has been fully consumed or not. * ListState.put() / ListState.appendList() no longer require an additional request to send the data separately. * We inline the data into the proto message if the length of the list we pass is small enough (for now it is "magically" set to 100 elements - need to look further). * If the length of the list is over 100, we fall back to the "old" Arrow send (rather than the custom protocol). This is because a pickled Python Row contains the schema information as a string, which is larger than we anticipated. So at some point, Arrow becomes more efficient. NOTE: 100 is a sort of "magic number", and we will need to improve this with more benchmarking. ### Why are the changes needed? To optimize further on ListState operations. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New UT. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #50689 from HeartSaVioR/SPARK-51891. Authored-by: Jungtaek Lim <[email protected]> Signed-off-by: Jungtaek Lim <[email protected]>
1 parent 83db398 commit 2c0fc70

File tree

9 files changed

+553
-115
lines changed

9 files changed

+553
-115
lines changed

python/pyspark/sql/streaming/list_state_client.py

Lines changed: 69 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def __init__(
3737
self.schema = schema
3838
# A dictionary to store the mapping between list state name and a tuple of data batch
3939
# and the index of the last row that was read.
40-
self.data_batch_dict: Dict[str, Tuple[Any, int]] = {}
40+
self.data_batch_dict: Dict[str, Tuple[Any, int, bool]] = {}
4141

4242
def exists(self, state_name: str) -> bool:
4343
import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage
@@ -61,12 +61,12 @@ def exists(self, state_name: str) -> bool:
6161
f"Error checking value state exists: " f"{response_message[1]}"
6262
)
6363

64-
def get(self, state_name: str, iterator_id: str) -> Tuple:
64+
def get(self, state_name: str, iterator_id: str) -> Tuple[Tuple, bool]:
6565
import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage
6666

6767
if iterator_id in self.data_batch_dict:
6868
# If the state is already in the dictionary, return the next row.
69-
data_batch, index = self.data_batch_dict[iterator_id]
69+
data_batch, index, require_next_fetch = self.data_batch_dict[iterator_id]
7070
else:
7171
# If the state is not in the dictionary, fetch the state from the server.
7272
get_call = stateMessage.ListStateGet(iteratorId=iterator_id)
@@ -79,23 +79,35 @@ def get(self, state_name: str, iterator_id: str) -> Tuple:
7979
message = stateMessage.StateRequest(stateVariableRequest=state_variable_request)
8080

8181
self._stateful_processor_api_client._send_proto_message(message.SerializeToString())
82-
response_message = self._stateful_processor_api_client._receive_proto_message()
82+
response_message = (
83+
self._stateful_processor_api_client._receive_proto_message_with_list_get()
84+
)
8385
status = response_message[0]
8486
if status == 0:
85-
data_batch = self._stateful_processor_api_client._read_list_state()
87+
data_batch = list(
88+
map(
89+
lambda x: self._stateful_processor_api_client._deserialize_from_bytes(x),
90+
response_message[2],
91+
)
92+
)
93+
require_next_fetch = response_message[3]
8694
index = 0
8795
else:
8896
raise StopIteration()
8997

98+
is_last_row = False
9099
new_index = index + 1
91100
if new_index < len(data_batch):
92101
# Update the index in the dictionary.
93-
self.data_batch_dict[iterator_id] = (data_batch, new_index)
102+
self.data_batch_dict[iterator_id] = (data_batch, new_index, require_next_fetch)
94103
else:
95104
# If the index is at the end of the data batch, remove the state from the dictionary.
96105
self.data_batch_dict.pop(iterator_id, None)
106+
is_last_row = True
107+
108+
is_last_row_from_iterator = is_last_row and not require_next_fetch
97109
row = data_batch[index]
98-
return tuple(row)
110+
return (tuple(row), is_last_row_from_iterator)
99111

100112
def append_value(self, state_name: str, value: Tuple) -> None:
101113
import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage
@@ -118,7 +130,24 @@ def append_value(self, state_name: str, value: Tuple) -> None:
118130
def append_list(self, state_name: str, values: List[Tuple]) -> None:
119131
import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage
120132

121-
append_list_call = stateMessage.AppendList()
133+
send_data_via_arrow = False
134+
135+
# To workaround mypy type assignment check.
136+
values_as_bytes: Any = []
137+
if len(values) == 100:
138+
# TODO(SPARK-51907): Let's update this to be either flexible or more reasonable default
139+
# value backed by various benchmarks.
140+
# Arrow codepath
141+
send_data_via_arrow = True
142+
else:
143+
values_as_bytes = map(
144+
lambda x: self._stateful_processor_api_client._serialize_to_bytes(self.schema, x),
145+
values,
146+
)
147+
148+
append_list_call = stateMessage.AppendList(
149+
value=values_as_bytes, fetchWithArrow=send_data_via_arrow
150+
)
122151
list_state_call = stateMessage.ListStateCall(
123152
stateName=state_name, appendList=append_list_call
124153
)
@@ -127,7 +156,9 @@ def append_list(self, state_name: str, values: List[Tuple]) -> None:
127156

128157
self._stateful_processor_api_client._send_proto_message(message.SerializeToString())
129158

130-
self._stateful_processor_api_client._send_list_state(self.schema, values)
159+
if send_data_via_arrow:
160+
self._stateful_processor_api_client._send_arrow_state(self.schema, values)
161+
131162
response_message = self._stateful_processor_api_client._receive_proto_message()
132163
status = response_message[0]
133164
if status != 0:
@@ -137,14 +168,32 @@ def append_list(self, state_name: str, values: List[Tuple]) -> None:
137168
def put(self, state_name: str, values: List[Tuple]) -> None:
138169
import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage
139170

140-
put_call = stateMessage.ListStatePut()
171+
send_data_via_arrow = False
172+
# To workaround mypy type assignment check.
173+
values_as_bytes: Any = []
174+
if len(values) == 100:
175+
# TODO(SPARK-51907): Let's update this to be either flexible or more reasonable default
176+
# value backed by various benchmarks.
177+
send_data_via_arrow = True
178+
else:
179+
values_as_bytes = map(
180+
lambda x: self._stateful_processor_api_client._serialize_to_bytes(self.schema, x),
181+
values,
182+
)
183+
184+
put_call = stateMessage.ListStatePut(
185+
value=values_as_bytes, fetchWithArrow=send_data_via_arrow
186+
)
187+
141188
list_state_call = stateMessage.ListStateCall(stateName=state_name, listStatePut=put_call)
142189
state_variable_request = stateMessage.StateVariableRequest(listStateCall=list_state_call)
143190
message = stateMessage.StateRequest(stateVariableRequest=state_variable_request)
144191

145192
self._stateful_processor_api_client._send_proto_message(message.SerializeToString())
146193

147-
self._stateful_processor_api_client._send_list_state(self.schema, values)
194+
if send_data_via_arrow:
195+
self._stateful_processor_api_client._send_arrow_state(self.schema, values)
196+
148197
response_message = self._stateful_processor_api_client._receive_proto_message()
149198
status = response_message[0]
150199
if status != 0:
@@ -174,9 +223,17 @@ def __init__(self, list_state_client: ListStateClient, state_name: str):
174223
# Generate a unique identifier for the iterator to make sure iterators from the same
175224
# list state do not interfere with each other.
176225
self.iterator_id = str(uuid.uuid4())
226+
self.iterator_fully_consumed = False
177227

178228
def __iter__(self) -> Iterator[Tuple]:
179229
return self
180230

181231
def __next__(self) -> Tuple:
182-
return self.list_state_client.get(self.state_name, self.iterator_id)
232+
if self.iterator_fully_consumed:
233+
raise StopIteration()
234+
235+
row, is_last_row = self.list_state_client.get(self.state_name, self.iterator_id)
236+
if is_last_row:
237+
self.iterator_fully_consumed = True
238+
239+
return row

python/pyspark/sql/streaming/proto/StateMessage_pb2.py

Lines changed: 79 additions & 77 deletions
Large diffs are not rendered by default.

python/pyspark/sql/streaming/proto/StateMessage_pb2.pyi

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ See the License for the specific language governing permissions and
3434
limitations under the License.
3535
"""
3636
import builtins
37+
import collections.abc
3738
import google.protobuf.descriptor
39+
import google.protobuf.internal.containers
3840
import google.protobuf.internal.enum_type_wrapper
3941
import google.protobuf.message
4042
import sys
@@ -229,6 +231,44 @@ class StateResponseWithStringTypeVal(google.protobuf.message.Message):
229231

230232
global___StateResponseWithStringTypeVal = StateResponseWithStringTypeVal
231233

234+
class StateResponseWithListGet(google.protobuf.message.Message):
235+
DESCRIPTOR: google.protobuf.descriptor.Descriptor
236+
237+
STATUSCODE_FIELD_NUMBER: builtins.int
238+
ERRORMESSAGE_FIELD_NUMBER: builtins.int
239+
VALUE_FIELD_NUMBER: builtins.int
240+
REQUIRENEXTFETCH_FIELD_NUMBER: builtins.int
241+
statusCode: builtins.int
242+
errorMessage: builtins.str
243+
@property
244+
def value(
245+
self,
246+
) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bytes]: ...
247+
requireNextFetch: builtins.bool
248+
def __init__(
249+
self,
250+
*,
251+
statusCode: builtins.int = ...,
252+
errorMessage: builtins.str = ...,
253+
value: collections.abc.Iterable[builtins.bytes] | None = ...,
254+
requireNextFetch: builtins.bool = ...,
255+
) -> None: ...
256+
def ClearField(
257+
self,
258+
field_name: typing_extensions.Literal[
259+
"errorMessage",
260+
b"errorMessage",
261+
"requireNextFetch",
262+
b"requireNextFetch",
263+
"statusCode",
264+
b"statusCode",
265+
"value",
266+
b"value",
267+
],
268+
) -> None: ...
269+
270+
global___StateResponseWithListGet = StateResponseWithListGet
271+
232272
class StatefulProcessorCall(google.protobuf.message.Message):
233273
DESCRIPTOR: google.protobuf.descriptor.Descriptor
234274

@@ -1042,8 +1082,24 @@ global___ListStateGet = ListStateGet
10421082
class ListStatePut(google.protobuf.message.Message):
10431083
DESCRIPTOR: google.protobuf.descriptor.Descriptor
10441084

1085+
VALUE_FIELD_NUMBER: builtins.int
1086+
FETCHWITHARROW_FIELD_NUMBER: builtins.int
1087+
@property
1088+
def value(
1089+
self,
1090+
) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bytes]: ...
1091+
fetchWithArrow: builtins.bool
10451092
def __init__(
10461093
self,
1094+
*,
1095+
value: collections.abc.Iterable[builtins.bytes] | None = ...,
1096+
fetchWithArrow: builtins.bool = ...,
1097+
) -> None: ...
1098+
def ClearField(
1099+
self,
1100+
field_name: typing_extensions.Literal[
1101+
"fetchWithArrow", b"fetchWithArrow", "value", b"value"
1102+
],
10471103
) -> None: ...
10481104

10491105
global___ListStatePut = ListStatePut
@@ -1065,8 +1121,24 @@ global___AppendValue = AppendValue
10651121
class AppendList(google.protobuf.message.Message):
10661122
DESCRIPTOR: google.protobuf.descriptor.Descriptor
10671123

1124+
VALUE_FIELD_NUMBER: builtins.int
1125+
FETCHWITHARROW_FIELD_NUMBER: builtins.int
1126+
@property
1127+
def value(
1128+
self,
1129+
) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bytes]: ...
1130+
fetchWithArrow: builtins.bool
10681131
def __init__(
10691132
self,
1133+
*,
1134+
value: collections.abc.Iterable[builtins.bytes] | None = ...,
1135+
fetchWithArrow: builtins.bool = ...,
1136+
) -> None: ...
1137+
def ClearField(
1138+
self,
1139+
field_name: typing_extensions.Literal[
1140+
"fetchWithArrow", b"fetchWithArrow", "value", b"value"
1141+
],
10701142
) -> None: ...
10711143

10721144
global___AppendList = AppendList

python/pyspark/sql/streaming/stateful_processor_api_client.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,18 @@ def _receive_proto_message_with_string_value(self) -> Tuple[int, str, str]:
425425
message.ParseFromString(bytes)
426426
return message.statusCode, message.errorMessage, message.value
427427

428+
# The third return type is RepeatedScalarFieldContainer[bytes], which is protobuf's container
429+
# type. We simplify it to Any here to avoid unnecessary complexity.
430+
def _receive_proto_message_with_list_get(self) -> Tuple[int, str, Any, bool]:
431+
import pyspark.sql.streaming.proto.StateMessage_pb2 as stateMessage
432+
433+
length = read_int(self.sockfile)
434+
bytes = self.sockfile.read(length)
435+
message = stateMessage.StateResponseWithListGet()
436+
message.ParseFromString(bytes)
437+
438+
return message.statusCode, message.errorMessage, message.value, message.requireNextFetch
439+
428440
def _receive_str(self) -> str:
429441
return self.utf8_deserializer.loads(self.sockfile)
430442

0 commit comments

Comments
 (0)