|
| 1 | +import os |
| 2 | + |
| 3 | +import pytest |
| 4 | +from arro3.core import Array, ChunkedArray, DataType, Table |
| 5 | +from numpy.random import standard_normal |
| 6 | + |
| 7 | +from deltalake import DeltaTable, QueryBuilder, write_deltalake |
| 8 | + |
| 9 | +# NOTE: make sure to run these in release mode with |
| 10 | +# MATURIN_EXTRA_ARGS=--release make develop |
| 11 | +# When profiling, use: |
| 12 | +# MATURIN_EXTRA_ARGS="--profile release-with-debug" make develop |
| 13 | + |
| 14 | + |
@pytest.fixture()
def sample_table() -> Table:
    """Build a ~128 MiB table: 20 float64 data columns plus an int64 index.

    The index column ``"i"`` lets benchmarks sort query results back into
    insertion order before comparing against this fixture.
    """
    max_size_bytes = 128 * 1024 * 1024
    ncols = 20
    # Derive the row count from ncols instead of repeating the literal 20,
    # so the two values cannot drift apart; 8 bytes per float64 cell.
    nrows = max_size_bytes // ncols // 8
    tab = Table.from_pydict({f"x{i}": standard_normal(nrows) for i in range(ncols)})
    # Add index column for sorting
    tab = tab.append_column(
        "i", ChunkedArray(Array(range(nrows), type=DataType.int64()))
    )
    return tab
| 26 | + |
| 27 | + |
@pytest.mark.benchmark(group="write")
def test_benchmark_write(benchmark, sample_table, tmp_path):
    """Benchmark overwriting a Delta table with ``sample_table``.

    After the timed writes, read the table back sorted on the index column
    and check it round-trips unchanged.
    """
    benchmark(write_deltalake, str(tmp_path), sample_table, mode="overwrite")

    dt = DeltaTable(str(tmp_path))
    # The index column added by the sample_table fixture is named "i";
    # "order by id" referenced a nonexistent column and would error.
    assert (
        QueryBuilder().register("tbl", dt).execute("select * from tbl order by i")
        == sample_table
    )
| 37 | + |
| 38 | + |
@pytest.mark.pyarrow
@pytest.mark.benchmark(group="read")
def test_benchmark_read(benchmark, sample_table, tmp_path):
    """Benchmark reading a Delta table into a pyarrow Table via deltalake."""
    import pyarrow as pa

    table_uri = str(tmp_path)
    write_deltalake(table_uri, sample_table)
    delta_table = DeltaTable(table_uri)

    loaded = benchmark(delta_table.to_pyarrow_table)
    # Rows may come back in any order; sort on the index column before comparing.
    assert loaded.sort_by("i") == pa.table(sample_table)
| 49 | + |
| 50 | + |
@pytest.mark.pyarrow
@pytest.mark.benchmark(group="read")
def test_benchmark_read_pyarrow(benchmark, sample_table, tmp_path):
    """Benchmark reading a Delta table through an explicit pyarrow filesystem."""
    import pyarrow as pa
    import pyarrow.fs as pa_fs

    table_uri = str(tmp_path)
    write_deltalake(table_uri, sample_table)
    delta_table = DeltaTable(table_uri)

    # Scope a local filesystem to the table directory and read through it.
    subtree_fs = pa_fs.SubTreeFileSystem(table_uri, pa_fs.LocalFileSystem())
    loaded = benchmark(delta_table.to_pyarrow_table, filesystem=subtree_fs)
    # Rows may come back in any order; sort on the index column before comparing.
    assert loaded.sort_by("i") == pa.table(sample_table)
| 63 | + |
| 64 | + |
@pytest.mark.benchmark(group="optimize")
@pytest.mark.parametrize("max_tasks", [1, 5])
def test_benchmark_optimize(benchmark, sample_table, tmp_path, max_tasks):
    """Benchmark ``optimize.compact`` over a partitioned table of small files.

    Creates 5 partitions ("a".."e"), each containing 10 files holding a
    1/10th slice of ``sample_table``, then times compaction with
    ``max_concurrent_tasks=max_tasks``.
    """
    # 5 partitions, each with files_per_part small files of the same slice.
    files_per_part = 10
    parts = ["a", "b", "c", "d", "e"]

    nrows = int(sample_table.num_rows / files_per_part)
    for part in parts:
        tab = sample_table.slice(0, nrows)
        # Partition values are strings ("a".."e"), so the column must be a
        # string type; the original declared int64, which mismatches the
        # data. (Assumes arro3 exposes DataType.string() — TODO confirm.)
        tab = tab.append_column(
            "part", ChunkedArray(Array([part] * nrows, type=DataType.string()))
        )
        for _ in range(files_per_part):
            write_deltalake(tmp_path, tab, mode="append", partition_by=["part"])

    dt = DeltaTable(tmp_path)

    assert len(dt.files()) == files_per_part * len(parts)
    initial_version = dt.version()

    def setup():
        # Instead of recreating the table for each benchmark run, we just delete
        # the optimize commit's log file so compaction can be replayed.
        optimize_version = initial_version + 1
        try:
            os.remove(
                os.path.join(tmp_path, "_delta_log", f"{optimize_version:020}.json")
            )
        except FileNotFoundError:
            # First round: no optimize commit exists yet.
            pass

        # Reload the table after we have altered the log
        dt = DeltaTable(tmp_path)
        assert dt.version() == initial_version

        return (dt,), dict(max_concurrent_tasks=max_tasks)

    def func(dt, max_concurrent_tasks):
        return dt.optimize.compact(
            max_concurrent_tasks=max_concurrent_tasks, target_size=1024 * 1024 * 1024
        )

    # setup() resets the table state between rounds.
    results = benchmark.pedantic(func, setup=setup, rounds=5)

    # Compaction replaces all files_per_part files in each partition with one.
    assert results["numFilesRemoved"] == files_per_part * len(parts)
    assert results["numFilesAdded"] == len(parts)
    assert results["partitionsOptimized"] == len(parts)
0 commit comments