Skip to content

Commit 5121983

Browse files
authored
fix(python): guard DataFusion FFI export on datafusion major version (delta-io#4142)
# Description - Add a runtime version check in __datafusion_table_provider__ to prevent FFI ABI mismatch segfaults - Block capsule export when installed datafusion major != 52 - Provide actionable error text with QueryBuilder workaround Changes: - lib.rs: add REQUIRED_DATAFUSION_PY_MAJOR, datafusion_python_version(), guard at method start - test_datafusion.py: add incompatible version and not installed tests Note: This guard is a temporary safety net to prevent segfaults until DataFusion 52 Python wheels are available on PyPI. Once wheels land, users can install datafusion==52.* and use SessionContext registration normally. # Related Issue(s) - delta-io#4135 <!--- For example: - closes delta-io#106 ---> # Documentation <!--- Share links to useful documentation ---> Signed-off-by: Ethan Urbanski <ethan@urbanskitech.com>
1 parent 6ee66e2 commit 5121983

2 files changed

Lines changed: 82 additions & 1 deletion

File tree

python/src/lib.rs

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ use deltalake::{DeltaResult, DeltaTable, DeltaTableBuilder, init_client_version}
5252
use futures::TryStreamExt;
5353
use pyo3::exceptions::{PyRuntimeError, PyValueError};
5454
use pyo3::pybacked::PyBackedStr;
55-
use pyo3::types::{PyCapsule, PyDict, PyFrozenSet};
55+
use pyo3::types::{PyCapsule, PyDict, PyFrozenSet, PyModule};
5656
use pyo3::{IntoPyObjectExt, prelude::*};
5757
use pyo3_arrow::export::{Arro3RecordBatch, Arro3RecordBatchReader};
5858
use pyo3_arrow::{PyRecordBatchReader, PySchema as PyArrowSchema};
@@ -115,6 +115,17 @@ struct RawDeltaTableMetaData {
115115

116116
type StringVec = Vec<String>;
117117

118+
const REQUIRED_DATAFUSION_PY_MAJOR: u32 = 52;
119+
120+
fn datafusion_python_version(py: Python<'_>) -> Option<String> {
121+
let importlib_metadata = PyModule::import(py, "importlib.metadata").ok()?;
122+
importlib_metadata
123+
.call_method1("version", ("datafusion",))
124+
.ok()?
125+
.extract()
126+
.ok()
127+
}
128+
118129
/// Segmented impl for RawDeltaTable to avoid these methods being exposed via the pymethods macro.
119130
///
120131
/// In essence all these functions should be considered internal to the Rust code and not exposed
@@ -1838,6 +1849,24 @@ impl RawDeltaTable {
18381849
&self,
18391850
py: Python<'py>,
18401851
) -> PyResult<Bound<'py, PyCapsule>> {
1852+
let found_version = datafusion_python_version(py);
1853+
let found_major = found_version
1854+
.as_deref()
1855+
.and_then(|version| version.split('.').next()?.parse().ok());
1856+
if found_major != Some(REQUIRED_DATAFUSION_PY_MAJOR) {
1857+
let found = found_version.unwrap_or_else(|| "not installed".to_string());
1858+
return Err(PyRuntimeError::new_err(format!(
1859+
"DataFusion Python integration requires datafusion=={required}.x (found: {found}).\n\n\
1860+
This deltalake build exports a DataFusion {required}.x FFI TableProvider; mismatched majors can segfault.\n\n\
1861+
Workaround (no DataFusion-Python required): use deltalake.QueryBuilder:\n\n\
1862+
from deltalake import DeltaTable, QueryBuilder\n\n\
1863+
dt = DeltaTable('path/to/table')\n\
1864+
data = QueryBuilder().register('tbl', dt).execute('SELECT * FROM tbl')\n\n\
1865+
Install datafusion=={required}.* (matching major) to use DataFusion SessionContext registration, or keep using QueryBuilder.",
1866+
required = REQUIRED_DATAFUSION_PY_MAJOR,
1867+
)));
1868+
}
1869+
18411870
let handle = rt().handle();
18421871
let name = CString::new("datafusion_table_provider").unwrap();
18431872
let table = self.with_table(|t| Ok(t.clone()))?;

python/tests/test_datafusion.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,58 @@ def _datafusion_major_version() -> int | None:
1414
return None
1515

1616

17+
def test_datafusion_table_provider_incompatible_version_errors(tmp_path, monkeypatch):
18+
# Force the runtime check to behave like "datafusion==51.x" even if datafusion isn't installed.
19+
call_count = {"count": 0}
20+
21+
def fake_version(pkg: str) -> str:
22+
assert pkg == "datafusion"
23+
call_count["count"] += 1
24+
if call_count["count"] == 1:
25+
return "51.0.0"
26+
return "52.0.0"
27+
28+
monkeypatch.setattr("importlib.metadata.version", fake_version)
29+
30+
table = Table(
31+
{"id": Array([1, 2, 3], Field("id", type=DataType.int64(), nullable=True))}
32+
)
33+
write_deltalake(tmp_path, table)
34+
dt = DeltaTable(tmp_path)
35+
36+
# Call the FFI export hook directly; DO NOT call SessionContext.register_* (that is what segfaults).
37+
with pytest.raises(RuntimeError) as exc_info:
38+
dt.__datafusion_table_provider__() # type: ignore[attr-defined]
39+
40+
assert call_count["count"] == 1
41+
42+
msg = str(exc_info.value)
43+
assert "datafusion" in msg
44+
assert "datafusion==52" in msg
45+
assert "QueryBuilder" in msg
46+
47+
48+
def test_datafusion_table_provider_not_installed_errors(tmp_path, monkeypatch):
49+
def fake_version(pkg: str) -> str:
50+
raise PackageNotFoundError(pkg)
51+
52+
monkeypatch.setattr("importlib.metadata.version", fake_version)
53+
54+
table = Table(
55+
{"id": Array([1, 2, 3], Field("id", type=DataType.int64(), nullable=True))}
56+
)
57+
write_deltalake(tmp_path, table)
58+
dt = DeltaTable(tmp_path)
59+
60+
with pytest.raises(RuntimeError) as exc_info:
61+
dt.__datafusion_table_provider__() # type: ignore[attr-defined]
62+
63+
msg = str(exc_info.value)
64+
assert "datafusion" in msg
65+
assert "not installed" in msg.lower()
66+
assert "QueryBuilder" in msg
67+
68+
1769
@pytest.mark.datafusion
1870
def test_datafusion_table_provider(tmp_path):
1971
if os.environ.get("DELTALAKE_RUN_DATAFUSION_TESTS") != "1":

0 commit comments

Comments
 (0)