|
| 1 | +#![cfg(feature = "datafusion")] |
| 2 | + |
| 3 | +use std::sync::Arc; |
| 4 | + |
| 5 | +use arrow_array::{Int32Array, RecordBatch, StringArray}; |
| 6 | +use arrow_schema::{DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema}; |
| 7 | +use datafusion::datasource::TableProvider; |
| 8 | +use datafusion::physical_plan::collect; |
| 9 | +use datafusion::prelude::{SessionContext, col, lit}; |
| 10 | +use deltalake_core::DeltaTable; |
| 11 | +use deltalake_core::delta_datafusion::bench_support; |
| 12 | +use deltalake_core::kernel::{DataType, PrimitiveType, StructField}; |
| 13 | +use deltalake_core::protocol::SaveMode; |
| 14 | +use deltalake_test::TestResult; |
| 15 | + |
| 16 | +fn batch(ids: Vec<i32>, parts: Vec<&str>) -> TestResult<RecordBatch> { |
| 17 | + Ok(RecordBatch::try_new( |
| 18 | + Arc::new(ArrowSchema::new(vec![ |
| 19 | + ArrowField::new("id", ArrowDataType::Int32, false), |
| 20 | + ArrowField::new("part", ArrowDataType::Utf8, false), |
| 21 | + ])), |
| 22 | + vec![ |
| 23 | + Arc::new(Int32Array::from(ids)), |
| 24 | + Arc::new(StringArray::from(parts)), |
| 25 | + ], |
| 26 | + )?) |
| 27 | +} |
| 28 | + |
| 29 | +#[tokio::test] |
| 30 | +async fn test_out_of_crate_bridge_exposes_file_selection_paths() -> TestResult { |
| 31 | + let table = DeltaTable::new_in_memory() |
| 32 | + .create() |
| 33 | + .with_columns(vec![ |
| 34 | + StructField::new( |
| 35 | + "id".to_string(), |
| 36 | + DataType::Primitive(PrimitiveType::Integer), |
| 37 | + false, |
| 38 | + ), |
| 39 | + StructField::new( |
| 40 | + "part".to_string(), |
| 41 | + DataType::Primitive(PrimitiveType::String), |
| 42 | + false, |
| 43 | + ), |
| 44 | + ]) |
| 45 | + .with_partition_columns(["part"]) |
| 46 | + .await?; |
| 47 | + let table = table |
| 48 | + .write(vec![batch(vec![1, 2], vec!["a", "a"])?]) |
| 49 | + .with_save_mode(SaveMode::Append) |
| 50 | + .await?; |
| 51 | + let table = table |
| 52 | + .write(vec![batch(vec![100, 101], vec!["b", "b"])?]) |
| 53 | + .with_save_mode(SaveMode::Append) |
| 54 | + .await?; |
| 55 | + |
| 56 | + let snapshot = table.snapshot()?.snapshot().clone(); |
| 57 | + let log_store = table.log_store(); |
| 58 | + let session = SessionContext::new().state(); |
| 59 | + |
| 60 | + let partition_result = bench_support::find_files( |
| 61 | + &snapshot, |
| 62 | + log_store.clone(), |
| 63 | + &session, |
| 64 | + Some(col("part").eq(lit("a"))), |
| 65 | + ) |
| 66 | + .await?; |
| 67 | + assert!(partition_result.partition_scan); |
| 68 | + assert_eq!(partition_result.candidates.len(), 1); |
| 69 | + assert!( |
| 70 | + partition_result.candidates[0].path.contains("part=a/"), |
| 71 | + "expected partition-only path to match partition file, got {}", |
| 72 | + partition_result.candidates[0].path |
| 73 | + ); |
| 74 | + |
| 75 | + let mem_table = bench_support::add_actions_partition_mem_table(&snapshot)? |
| 76 | + .expect("partition mem table should exist"); |
| 77 | + let field_names = mem_table |
| 78 | + .schema() |
| 79 | + .fields() |
| 80 | + .iter() |
| 81 | + .map(|field| field.name().clone()) |
| 82 | + .collect::<Vec<_>>(); |
| 83 | + assert_eq!(field_names, vec!["__delta_rs_path", "part"]); |
| 84 | + |
| 85 | + let data_predicate = col("id").gt(lit(50i32)); |
| 86 | + let data_result = bench_support::find_files( |
| 87 | + &snapshot, |
| 88 | + log_store.clone(), |
| 89 | + &session, |
| 90 | + Some(data_predicate.clone()), |
| 91 | + ) |
| 92 | + .await?; |
| 93 | + assert!(!data_result.partition_scan); |
| 94 | + assert_eq!(data_result.candidates.len(), 1); |
| 95 | + assert!( |
| 96 | + data_result.candidates[0].path.contains("part=b/"), |
| 97 | + "expected data predicate to match partition b file, got {}", |
| 98 | + data_result.candidates[0].path |
| 99 | + ); |
| 100 | + |
| 101 | + let direct_scan = bench_support::find_files_scan( |
| 102 | + &snapshot, |
| 103 | + log_store.clone(), |
| 104 | + &session, |
| 105 | + data_predicate.clone(), |
| 106 | + ) |
| 107 | + .await?; |
| 108 | + assert_eq!(direct_scan.len(), 1); |
| 109 | + assert_eq!(direct_scan[0].path, data_result.candidates[0].path); |
| 110 | + |
| 111 | + let matched_scan = |
| 112 | + bench_support::scan_files_where_matches(&session, &snapshot, log_store, data_predicate) |
| 113 | + .await? |
| 114 | + .expect("matched file scan should exist"); |
| 115 | + assert_eq!(matched_scan.files_set().len(), 1); |
| 116 | + assert_eq!(matched_scan.predicate(), &col("id").gt(lit(50i32))); |
| 117 | + |
| 118 | + let plan = session.create_physical_plan(matched_scan.scan()).await?; |
| 119 | + let batches = collect(plan, session.task_ctx()).await?; |
| 120 | + let row_count = batches.iter().map(|batch| batch.num_rows()).sum::<usize>(); |
| 121 | + assert_eq!(row_count, 2); |
| 122 | + |
| 123 | + Ok(()) |
| 124 | +} |
0 commit comments