@@ -46,6 +46,7 @@ use std::error::Error;
4646mod common;
4747
4848mod local {
49+ use datafusion:: common:: stats:: Precision ;
4950 use deltalake_core:: writer:: JsonWriter ;
5051
5152 use super :: * ;
@@ -281,67 +282,146 @@ mod local {
281282
282283 #[ tokio:: test]
283284 async fn test_datafusion_stats ( ) -> Result < ( ) > {
285+ // Validate a table that contains statistics for all files
284286 let table = open_table ( "./tests/data/delta-0.8.0" ) . await . unwrap ( ) ;
285- let statistics = table. state . datafusion_table_statistics ( ) ;
287+ let statistics = table. state . datafusion_table_statistics ( ) ? ;
286288
287- assert_eq ! ( statistics. num_rows, Some ( 4 ) , ) ;
289+ assert_eq ! ( statistics. num_rows, Precision :: Exact ( 4_usize ) , ) ;
288290
289- assert_eq ! ( statistics. total_byte_size, Some ( 440 + 440 ) ) ;
291+ assert_eq ! (
292+ statistics. total_byte_size,
293+ Precision :: Exact ( ( 440 + 440 ) as usize )
294+ ) ;
290295
296+ let column_stats = statistics. column_statistics . get ( 0 ) . unwrap ( ) ;
297+ assert_eq ! ( column_stats. null_count, Precision :: Exact ( 0 ) ) ;
291298 assert_eq ! (
292- statistics
293- . column_statistics
294- . clone( )
295- . unwrap( )
296- . iter( )
297- . map( |x| x. null_count)
298- . collect:: <Vec <Option <usize >>>( ) ,
299- vec![ Some ( 0 ) ] ,
299+ column_stats. max_value,
300+ Precision :: Exact ( ScalarValue :: from( 4_i32 ) )
301+ ) ;
302+ assert_eq ! (
303+ column_stats. min_value,
304+ Precision :: Exact ( ScalarValue :: from( 0_i32 ) )
300305 ) ;
301306
302307 let ctx = SessionContext :: new ( ) ;
303308 ctx. register_table ( "test_table" , Arc :: new ( table) ) ?;
304-
305- let batches = ctx
309+ let actual = ctx
306310 . sql ( "SELECT max(value), min(value) FROM test_table" )
307311 . await ?
308312 . collect ( )
309313 . await ?;
310314
311- assert_eq ! ( batches. len( ) , 1 ) ;
312- let batch = & batches[ 0 ] ;
315+ let expected = vec ! [
316+ "+-----------------------+-----------------------+" ,
317+ "| MAX(test_table.value) | MIN(test_table.value) |" ,
318+ "+-----------------------+-----------------------+" ,
319+ "| 4 | 0 |" ,
320+ "+-----------------------+-----------------------+" ,
321+ ] ;
322+ assert_batches_sorted_eq ! ( & expected, & actual) ;
323+
324+ // Validate a table that does not contain column statistics
325+ let table = open_table ( "./tests/data/delta-0.2.0" ) . await . unwrap ( ) ;
326+ let statistics = table. state . datafusion_table_statistics ( ) ?;
327+
328+ assert_eq ! ( statistics. num_rows, Precision :: Absent ) ;
329+
313330 assert_eq ! (
314- batch . column ( 0 ) . as_ref ( ) ,
315- Arc :: new ( Int32Array :: from ( vec! [ 4 ] ) ) . as_ref ( ) ,
331+ statistics . total_byte_size ,
332+ Precision :: Exact ( ( 400 + 404 + 396 ) as usize )
316333 ) ;
334+ let column_stats = statistics. column_statistics . get ( 0 ) . unwrap ( ) ;
335+ assert_eq ! ( column_stats. null_count, Precision :: Absent ) ;
336+ assert_eq ! ( column_stats. max_value, Precision :: Absent ) ;
337+ assert_eq ! ( column_stats. min_value, Precision :: Absent ) ;
338+
339+ ctx. register_table ( "test_table2" , Arc :: new ( table) ) ?;
340+ let actual = ctx
341+ . sql ( "SELECT max(value), min(value) FROM test_table2" )
342+ . await ?
343+ . collect ( )
344+ . await ?;
317345
346+ let expected = vec ! [
347+ "+------------------------+------------------------+" ,
348+ "| MAX(test_table2.value) | MIN(test_table2.value) |" ,
349+ "+------------------------+------------------------+" ,
350+ "| 3 | 1 |" ,
351+ "+------------------------+------------------------+" ,
352+ ] ;
353+ assert_batches_sorted_eq ! ( & expected, & actual) ;
354+
355+ // Validate a table that contains nested structures.
356+
357+ // This table is interesting since it goes through schema evolution.
358+ // In particular 'new_column' contains statistics for when it
359+ // is introduced (10) but the commit following (11) does not contain
360+ // statistics for this column.
361+ let table = open_table ( "./tests/data/delta-1.2.1-only-struct-stats" )
362+ . await
363+ . unwrap ( ) ;
364+ let schema = table. get_schema ( ) . unwrap ( ) ;
365+ let statistics = table. state . datafusion_table_statistics ( ) ?;
366+ assert_eq ! ( statistics. num_rows, Precision :: Exact ( 12 ) ) ;
367+
368+ // `new_column` statistics
369+ let stats = statistics
370+ . column_statistics
371+ . get ( schema. index_of ( "new_column" ) . unwrap ( ) )
372+ . unwrap ( ) ;
373+ assert_eq ! ( stats. null_count, Precision :: Absent ) ;
374+ assert_eq ! ( stats. min_value, Precision :: Absent ) ;
375+ assert_eq ! ( stats. max_value, Precision :: Absent ) ;
376+
377+ // `date` statistics
378+ let stats = statistics
379+ . column_statistics
380+ . get ( schema. index_of ( "date" ) . unwrap ( ) )
381+ . unwrap ( ) ;
382+ assert_eq ! ( stats. null_count, Precision :: Exact ( 0 ) ) ;
383+ // 2022-10-24
318384 assert_eq ! (
319- batch . column ( 1 ) . as_ref ( ) ,
320- Arc :: new ( Int32Array :: from ( vec! [ 0 ] ) ) . as_ref ( ) ,
385+ stats . min_value ,
386+ Precision :: Exact ( ScalarValue :: Date32 ( Some ( 19289 ) ) )
321387 ) ;
322-
323388 assert_eq ! (
324- statistics
325- . column_statistics
326- . clone( )
327- . unwrap( )
328- . iter( )
329- . map( |x| x. max_value. as_ref( ) )
330- . collect:: <Vec <Option <& ScalarValue >>>( ) ,
331- vec![ Some ( & ScalarValue :: from( 4_i32 ) ) ] ,
389+ stats. max_value,
390+ Precision :: Exact ( ScalarValue :: Date32 ( Some ( 19289 ) ) )
332391 ) ;
333392
393+ // `timestamp` statistics
394+ let stats = statistics
395+ . column_statistics
396+ . get ( schema. index_of ( "timestamp" ) . unwrap ( ) )
397+ . unwrap ( ) ;
398+ assert_eq ! ( stats. null_count, Precision :: Exact ( 0 ) ) ;
399+ // 2022-10-24T22:59:32.846Z
334400 assert_eq ! (
335- statistics
336- . column_statistics
337- . clone( )
338- . unwrap( )
339- . iter( )
340- . map( |x| x. min_value. as_ref( ) )
341- . collect:: <Vec <Option <& ScalarValue >>>( ) ,
342- vec![ Some ( & ScalarValue :: from( 0_i32 ) ) ] ,
401+ stats. min_value,
402+ Precision :: Exact ( ScalarValue :: TimestampMicrosecond (
403+ Some ( 1666652372846000 ) ,
404+ None
405+ ) )
406+ ) ;
407+ // 2022-10-24T22:59:46.083Z
408+ assert_eq ! (
409+ stats. max_value,
410+ Precision :: Exact ( ScalarValue :: TimestampMicrosecond (
411+ Some ( 1666652386083000 ) ,
412+ None
413+ ) )
343414 ) ;
344415
416+ // `nested_struct` statistics
417+ let stats = statistics
418+ . column_statistics
419+ . get ( schema. index_of ( "nested_struct" ) . unwrap ( ) )
420+ . unwrap ( ) ;
421+ assert_eq ! ( stats. null_count, Precision :: Absent ) ;
422+ assert_eq ! ( stats. min_value, Precision :: Absent ) ;
423+ assert_eq ! ( stats. max_value, Precision :: Absent ) ;
424+
345425 Ok ( ( ) )
346426 }
347427
@@ -782,14 +862,14 @@ mod local {
782862
783863 let expected_schema = ArrowSchema :: new ( vec ! [
784864 ArrowField :: new( "c3" , ArrowDataType :: Int32 , true ) ,
785- ArrowField :: new( "c1" , ArrowDataType :: Int32 , false ) ,
865+ ArrowField :: new( "c1" , ArrowDataType :: Int32 , true ) ,
786866 ArrowField :: new(
787867 "c2" ,
788868 ArrowDataType :: Dictionary (
789869 Box :: new( ArrowDataType :: UInt16 ) ,
790870 Box :: new( ArrowDataType :: Utf8 ) ,
791871 ) ,
792- false ,
872+ true ,
793873 ) ,
794874 ] ) ;
795875
0 commit comments