//! This file contains an end-to-end test of parquet pruning. It writes
//! data into a parquet file and then verifies row groups are pruned as
//! expected.
21+ use std:: sync:: Arc ;
22+
23+ use arrow:: array:: { ArrayRef , Int32Array , RecordBatch } ;
24+ use arrow_schema:: { DataType , Field , Schema } ;
2125use datafusion:: prelude:: SessionConfig ;
22- use datafusion_common:: ScalarValue ;
26+ use datafusion_common:: { DataFusionError , ScalarValue } ;
2327use itertools:: Itertools ;
2428
2529use crate :: parquet:: Unit :: RowGroup ;
@@ -30,12 +34,12 @@ struct RowGroupPruningTest {
3034 query : String ,
3135 expected_errors : Option < usize > ,
3236 expected_row_group_matched_by_statistics : Option < usize > ,
33- // expected_row_group_fully_matched_by_statistics: Option<usize>,
37+ expected_row_group_fully_matched_by_statistics : Option < usize > ,
3438 expected_row_group_pruned_by_statistics : Option < usize > ,
3539 expected_files_pruned_by_statistics : Option < usize > ,
3640 expected_row_group_matched_by_bloom_filter : Option < usize > ,
3741 expected_row_group_pruned_by_bloom_filter : Option < usize > ,
38- // expected_limit_pruned_row_groups: Option<usize>,
42+ expected_limit_pruned_row_groups : Option < usize > ,
3943 expected_rows : usize ,
4044}
4145impl RowGroupPruningTest {
@@ -47,11 +51,11 @@ impl RowGroupPruningTest {
4751 expected_errors : None ,
4852 expected_row_group_matched_by_statistics : None ,
4953 expected_row_group_pruned_by_statistics : None ,
50- // expected_row_group_fully_matched_by_statistics: None,
54+ expected_row_group_fully_matched_by_statistics : None ,
5155 expected_files_pruned_by_statistics : None ,
5256 expected_row_group_matched_by_bloom_filter : None ,
5357 expected_row_group_pruned_by_bloom_filter : None ,
54- // expected_limit_pruned_row_groups: None,
58+ expected_limit_pruned_row_groups : None ,
5559 expected_rows : 0 ,
5660 }
5761 }
@@ -81,7 +85,6 @@ impl RowGroupPruningTest {
8185 }
8286
8387 // Set the expected fully matched row groups by statistics
84- /*
8588 fn with_fully_matched_by_stats (
8689 mut self ,
8790 fully_matched_by_stats : Option < usize > ,
@@ -90,12 +93,6 @@ impl RowGroupPruningTest {
9093 self
9194 }
9295
93- fn with_limit_pruned_row_groups(mut self, pruned_by_limit: Option<usize>) -> Self {
94- self.expected_limit_pruned_row_groups = pruned_by_limit;
95- self
96- }
97- */
98-
9996 // Set the expected pruned row groups by statistics
10097 fn with_pruned_by_stats ( mut self , pruned_by_stats : Option < usize > ) -> Self {
10198 self . expected_row_group_pruned_by_statistics = pruned_by_stats;
@@ -119,6 +116,11 @@ impl RowGroupPruningTest {
119116 self
120117 }
121118
119+ fn with_limit_pruned_row_groups ( mut self , pruned_by_limit : Option < usize > ) -> Self {
120+ self . expected_limit_pruned_row_groups = pruned_by_limit;
121+ self
122+ }
123+
122124 /// Set the number of expected rows from the output of this test
123125 fn with_expected_rows ( mut self , rows : usize ) -> Self {
124126 self . expected_rows = rows;
@@ -155,12 +157,12 @@ impl RowGroupPruningTest {
155157 ) ;
156158 let bloom_filter_metrics = output. row_groups_bloom_filter ( ) ;
157159 assert_eq ! (
158- bloom_filter_metrics. map( |( _pruned , matched ) | matched ) ,
160+ bloom_filter_metrics. as_ref ( ) . map( |pm| pm . total_matched ( ) ) ,
159161 self . expected_row_group_matched_by_bloom_filter,
160162 "mismatched row_groups_matched_bloom_filter" ,
161163 ) ;
162164 assert_eq ! (
163- bloom_filter_metrics. map( |( pruned , _matched ) | pruned ) ,
165+ bloom_filter_metrics. map( |pm| pm . total_pruned ( ) ) ,
164166 self . expected_row_group_pruned_by_bloom_filter,
165167 "mismatched row_groups_pruned_bloom_filter" ,
166168 ) ;
@@ -175,6 +177,64 @@ impl RowGroupPruningTest {
175177 ) ;
176178 }
177179
180+ // Execute the test with the current configuration
181+ async fn test_row_group_prune_with_custom_data (
182+ self ,
183+ schema : Arc < Schema > ,
184+ batches : Vec < RecordBatch > ,
185+ max_row_per_group : usize ,
186+ ) {
187+ let output = ContextWithParquet :: with_custom_data (
188+ self . scenario ,
189+ RowGroup ( max_row_per_group) ,
190+ schema,
191+ batches,
192+ )
193+ . await
194+ . query ( & self . query )
195+ . await ;
196+
197+ println ! ( "{}" , output. description( ) ) ;
198+ assert_eq ! (
199+ output. predicate_evaluation_errors( ) ,
200+ self . expected_errors,
201+ "mismatched predicate_evaluation error"
202+ ) ;
203+ assert_eq ! (
204+ output. row_groups_matched_statistics( ) ,
205+ self . expected_row_group_matched_by_statistics,
206+ "mismatched row_groups_matched_statistics" ,
207+ ) ;
208+ assert_eq ! (
209+ output. row_groups_fully_matched_statistics( ) ,
210+ self . expected_row_group_fully_matched_by_statistics,
211+ "mismatched row_groups_fully_matched_statistics" ,
212+ ) ;
213+ assert_eq ! (
214+ output. row_groups_pruned_statistics( ) ,
215+ self . expected_row_group_pruned_by_statistics,
216+ "mismatched row_groups_pruned_statistics" ,
217+ ) ;
218+ assert_eq ! (
219+ output. files_ranges_pruned_statistics( ) ,
220+ self . expected_files_pruned_by_statistics,
221+ "mismatched files_ranges_pruned_statistics" ,
222+ ) ;
223+ assert_eq ! (
224+ output. limit_pruned_row_groups( ) ,
225+ self . expected_limit_pruned_row_groups,
226+ "mismatched limit_pruned_row_groups" ,
227+ ) ;
228+ assert_eq ! (
229+ output. result_rows,
230+ self . expected_rows,
231+ "Expected {} rows, got {}: {}" ,
232+ output. result_rows,
233+ self . expected_rows,
234+ output. description( ) ,
235+ ) ;
236+ }
237+
178238 // Execute the test with the current configuration
179239 /*
180240 async fn test_row_group_prune_with_custom_data(
@@ -1723,7 +1783,6 @@ async fn test_bloom_filter_decimal_dict() {
17231783 . await ;
17241784}
17251785
1726- /*
17271786// Helper function to create a batch with a single Int32 column.
17281787fn make_i32_batch (
17291788 name : & str ,
@@ -1950,15 +2009,13 @@ async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error:
19502009 . with_scenario ( Scenario :: Int )
19512010 . with_query ( query)
19522011 . with_expected_errors ( Some ( 0 ) )
1953- .with_expected_rows(10) // Total: 1 + 3 + 4 + 1 = 9 (less than limit)
2012+ . with_expected_rows ( 10 ) // Total: 1 + 4 + 4 + 1 = 10
19542013 . with_pruned_files ( Some ( 0 ) )
19552014 . with_matched_by_stats ( Some ( 4 ) ) // RG0,1,2,3 matched
19562015 . with_fully_matched_by_stats ( Some ( 2 ) )
19572016 . with_pruned_by_stats ( Some ( 1 ) ) // RG4 pruned
19582017 . with_limit_pruned_row_groups ( Some ( 0 ) ) // No limit pruning since we need all RGs
19592018 . test_row_group_prune_with_custom_data ( schema, batches, 4 )
19602019 . await ;
1961-
19622020 Ok ( ( ) )
19632021}
1964- */
0 commit comments