Skip to content

Commit 888b03f

Browse files
committed
chore: pr feedback
Signed-off-by: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com>
1 parent 92e75bb commit 888b03f

7 files changed

Lines changed: 132 additions & 44 deletions

File tree

.github/workflows/python_benchmark.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ jobs:
3939

4040
- name: Run benchmark
4141
run: |
42-
uv run pytest tests/test_benchmark.py -m benchmark --benchmark-json output.json
42+
uv run pytest tests/test_benchmark.py -m '(benchmark and pyarrow)' --benchmark-json output.json
4343
4444
- name: Store benchmark result
4545
uses: benchmark-action/github-action-benchmark@v1
@@ -48,4 +48,3 @@ jobs:
4848
output-file-path: python/output.json
4949
external-data-json-path: ./cache/benchmark-data.json
5050
fail-on-alert: true
51-

.github/workflows/python_build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ jobs:
4949
run: make unit-test
5050

5151
test:
52-
name: Python Build (Python 3.10 Optional latest pyarrow)
52+
name: Python Build (Python 3.10 PyArrow latest)
5353
runs-on: ubuntu-latest
5454
env:
5555
SCCACHE_GHA_ENABLED: "true"

python/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ doc = false
1717
[dependencies]
1818
delta_kernel.workspace = true
1919

20-
pyo3-arrow = { version = "*", default-features = false}
20+
pyo3-arrow = { version = "0.9.0", default-features = false}
2121

2222
# arrow
2323
arrow-schema = { workspace = true, features = ["serde"] }

python/deltalake/_internal.pyi

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -404,13 +404,12 @@ class PrimitiveType:
404404

405405
@staticmethod
406406
def from_arrow(type: ArrowSchemaExportable) -> PrimitiveType:
407-
"""Create a PrimitiveType from an object with an `__arrow_c_schema__`
408-
datatype
407+
"""Create a PrimitiveType from an `ArrowSchemaExportable` datatype
409408
410409
Will raise `TypeError` if the arrow type is not a primitive type.
411410
412411
Args:
413-
type: an object with an `__arrow_c_schema__
412+
type: an object that is `ArrowSchemaExportable`
414413
415414
Returns:
416415
a PrimitiveType
@@ -494,13 +493,12 @@ class ArrayType:
494493
"""Get the equivalent arro3 type."""
495494
@staticmethod
496495
def from_arrow(type: ArrowSchemaExportable) -> ArrayType:
497-
"""Create an ArrayType from an object with an `__arrow_c_schema__`
498-
datatype.
496+
"""Create an ArrayType from an `ArrowSchemaExportable` datatype.
499497
500498
Will raise `TypeError` if a different arrow DataType is provided.
501499
502500
Args:
503-
type: an object with an `__arrow_c_schema__
501+
type: an object that is `ArrowSchemaExportable`
504502
505503
Returns:
506504
an ArrayType
@@ -602,13 +600,12 @@ class MapType:
602600

603601
@staticmethod
604602
def from_arrow(type: ArrowSchemaExportable) -> MapType:
605-
"""Create a MapType from an object with an `__arrow_c_schema__`
606-
datatype.
603+
"""Create a MapType from an `ArrowSchemaExportable` datatype
607604
608605
Will raise `TypeError` if passed a different type.
609606
610607
Args:
611-
type: an object with an `__arrow_c_schema__
608+
type: an object that is `ArrowSchemaExportable`
612609
613610
Returns:
614611
a MapType
@@ -716,13 +713,12 @@ class Field:
716713
"""
717714
@staticmethod
718715
def from_arrow(field: ArrowSchemaExportable) -> Field:
719-
"""Create a Field from an object with an `__arrow_c_schema__`
720-
field
716+
"""Create a Field from an object with an `ArrowSchemaExportable` field
721717
722718
Note: This currently doesn't preserve field metadata.
723719
724720
Args:
725-
field: a Field object with an `__arrow_c_schema__
721+
field: a Field object that is `ArrowSchemaExportable`
726722
727723
Returns:
728724
a Field
@@ -793,13 +789,12 @@ class StructType:
793789

794790
@staticmethod
795791
def from_arrow(type: ArrowSchemaExportable) -> StructType:
796-
"""Create a new StructType from an object with an `__arrow_c_schema__`
797-
datatype.
792+
"""Create a new StructType from an `ArrowSchemaExportable` datatype
798793
799794
Will raise `TypeError` if a different data type is provided.
800795
801796
Args:
802-
type: a struct type object with an `__arrow_c_schema__
797+
type: a struct type object that is `ArrowSchemaExportable`
803798
804799
Returns:
805800
a StructType
@@ -872,7 +867,7 @@ class Schema:
872867
Will raise `TypeError` if one of the Arrow type is not a primitive type.
873868
874869
Args:
875-
type: an object with an `__arrow_c_schema__
870+
type: an object that is `ArrowSchemaExportable`
876871
877872
Returns:
878873
a Schema

python/deltalake/fs/_base_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def from_table(
3232
table: RawDeltaTable,
3333
options: dict[str, str] | None = None,
3434
known_sizes: dict[str, int] | None = None,
35-
) -> "BaseDeltaStorageHandler":
35+
) -> BaseDeltaStorageHandler:
3636
self = cls.__new__(cls)
3737
self._handler = DeltaFileSystemHandler.from_table(table, options, known_sizes)
3838
return self

python/tests/test_benchmark.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import os
2+
3+
import pytest
4+
from arro3.core import Array, ChunkedArray, DataType, Table
5+
from numpy.random import standard_normal
6+
7+
from deltalake import DeltaTable, QueryBuilder, write_deltalake
8+
9+
# NOTE: make sure to run these in release mode with
10+
# MATURIN_EXTRA_ARGS=--release make develop
11+
# When profiling, use:
12+
# MATURIN_EXTRA_ARGS="--profile release-with-debug" make develop
13+
14+
15+
@pytest.fixture()
def sample_table() -> Table:
    """Return a ~128 MiB benchmark table.

    20 float64 columns of standard-normal noise, plus an int64 index
    column "i" that assertions can sort on deterministically.
    """
    target_bytes = 128 * 1024 * 1024
    ncols = 20
    # float64 cells are 8 bytes each
    nrows = target_bytes // ncols // 8
    columns = {f"x{idx}": standard_normal(nrows) for idx in range(ncols)}
    table = Table.from_pydict(columns)
    index = ChunkedArray(Array(range(nrows), type=DataType.int64()))
    return table.append_column("i", index)
26+
27+
28+
@pytest.mark.benchmark(group="write")
def test_benchmark_write(benchmark, sample_table, tmp_path):
    """Benchmark overwriting a Delta table, then verify the written rows."""
    benchmark(write_deltalake, str(tmp_path), sample_table, mode="overwrite")

    dt = DeltaTable(str(tmp_path))
    # The fixture's sort column is named "i" (there is no "id" column),
    # so order on "i" before comparing with the source table.
    assert (
        QueryBuilder().register("tbl", dt).execute("select * from tbl order by i")
        == sample_table
    )
37+
38+
39+
@pytest.mark.pyarrow
@pytest.mark.benchmark(group="read")
def test_benchmark_read(benchmark, sample_table, tmp_path):
    """Benchmark reading a Delta table into a pyarrow Table."""
    import pyarrow as pa

    path = str(tmp_path)
    write_deltalake(path, sample_table)
    table = DeltaTable(path)

    loaded = benchmark(table.to_pyarrow_table)
    # Read order is not guaranteed; sort on the index column before comparing.
    assert loaded.sort_by("i") == pa.table(sample_table)
49+
50+
51+
@pytest.mark.pyarrow
@pytest.mark.benchmark(group="read")
def test_benchmark_read_pyarrow(benchmark, sample_table, tmp_path):
    """Benchmark reading a Delta table through an explicit pyarrow filesystem."""
    import pyarrow as pa
    import pyarrow.fs as pa_fs

    path = str(tmp_path)
    write_deltalake(path, sample_table)
    table = DeltaTable(path)

    local_fs = pa_fs.SubTreeFileSystem(path, pa_fs.LocalFileSystem())
    loaded = benchmark(table.to_pyarrow_table, filesystem=local_fs)
    # Read order is not guaranteed; sort on the index column before comparing.
    assert loaded.sort_by("i") == pa.table(sample_table)
63+
64+
65+
@pytest.mark.benchmark(group="optimize")
@pytest.mark.parametrize("max_tasks", [1, 5])
def test_benchmark_optimize(benchmark, sample_table, tmp_path, max_tasks):
    """Benchmark compacting a partitioned table made of many small files.

    Writes 5 partitions with 10 files each, then benchmarks
    `optimize.compact`, resetting the log between rounds so every round
    starts from the same un-optimized state.
    """
    # Create 5 partitions, each with 10 files.
    files_per_part = 10
    parts = ["a", "b", "c", "d", "e"]

    nrows = int(sample_table.num_rows / files_per_part)
    for part in parts:
        tab = sample_table.slice(0, nrows)
        # Partition values are strings ("a".."e"), so the column must be a
        # string type — int64 here would reject the values.
        tab = tab.append_column(
            "part", ChunkedArray(Array([part] * nrows), DataType.string())
        )
        for _ in range(files_per_part):
            write_deltalake(tmp_path, tab, mode="append", partition_by=["part"])

    # A single load is enough (the original constructed this three times).
    dt = DeltaTable(tmp_path)

    assert len(dt.files()) == files_per_part * len(parts)
    initial_version = dt.version()

    def setup():
        # Instead of recreating the table for each benchmark run, we just
        # delete the optimize commit's log file, reverting to the
        # un-optimized state.
        optimize_version = initial_version + 1
        try:
            os.remove(
                os.path.join(tmp_path, "_delta_log", f"{optimize_version:020}.json")
            )
        except FileNotFoundError:
            # First round: no optimize commit exists yet.
            pass

        # Reload the table after we have altered the log
        dt = DeltaTable(tmp_path)
        assert dt.version() == initial_version

        return (dt,), dict(max_concurrent_tasks=max_tasks)

    def func(dt, max_concurrent_tasks):
        return dt.optimize.compact(
            max_concurrent_tasks=max_concurrent_tasks, target_size=1024 * 1024 * 1024
        )

    # We need to reset the table state for each benchmark round.
    results = benchmark.pedantic(func, setup=setup, rounds=5)

    assert results["numFilesRemoved"] == 50
    assert results["numFilesAdded"] == 5
    assert results["partitionsOptimized"] == 5

python/tests/test_schema.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -231,29 +231,6 @@ def test_delta_schema():
231231
assert schema_without_metadata == Schema.from_arrow(pa_schema)
232232

233233

234-
@pytest.mark.pyarrow
235-
def _generate_test_type():
236-
import pyarrow as pa
237-
238-
class UuidType(pa.ExtensionType):
239-
def __init__(self):
240-
pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid")
241-
242-
def __arrow_ext_serialize__(self):
243-
# since we don't have a parameterized type, we don't need extra
244-
# metadata to be deserialized
245-
return b""
246-
247-
@classmethod
248-
def __arrow_ext_deserialize__(self, storage_type, serialized):
249-
# return an instance of this subclass given the serialized
250-
# metadata.
251-
return UuidType()
252-
253-
pa.register_extension_type(UuidType())
254-
return UuidType()
255-
256-
257234
# <https://github.com/delta-io/delta-rs/issues/3174>
258235
def test_field_serialization():
259236
from deltalake import Field

0 commit comments

Comments
 (0)