@@ -27,19 +27,42 @@ extern crate uuid;
2727
2828use apache_avro:: types:: Value ;
2929use apache_avro:: { to_avro_datum, Decimal , Schema as ApacheSchema } ;
30- use arrow_avro:: schema:: { Fingerprint , SINGLE_OBJECT_MAGIC } ;
30+ use arrow_avro:: schema:: { Fingerprint , FingerprintAlgorithm , CONFLUENT_MAGIC , SINGLE_OBJECT_MAGIC } ;
3131use arrow_avro:: { reader:: ReaderBuilder , schema:: AvroSchema } ;
3232use criterion:: { criterion_group, criterion_main, BatchSize , BenchmarkId , Criterion , Throughput } ;
3333use once_cell:: sync:: Lazy ;
3434use std:: { hint:: black_box, time:: Duration } ;
3535use uuid:: Uuid ;
3636
37- fn make_prefix ( fp : Fingerprint ) -> [ u8 ; 10 ] {
38- let Fingerprint :: Rabin ( val) = fp;
39- let mut buf = [ 0u8 ; 10 ] ;
40- buf[ ..2 ] . copy_from_slice ( & SINGLE_OBJECT_MAGIC ) ; // C3 01
41- buf[ 2 ..] . copy_from_slice ( & val. to_le_bytes ( ) ) ; // little‑endian 64‑bit
42- buf
37+ fn make_prefix ( fp : Fingerprint ) -> Vec < u8 > {
38+ match fp {
39+ Fingerprint :: Rabin ( val) => {
40+ let mut buf = Vec :: with_capacity ( SINGLE_OBJECT_MAGIC . len ( ) + size_of :: < u64 > ( ) ) ;
41+ buf. extend_from_slice ( & SINGLE_OBJECT_MAGIC ) ; // C3 01
42+ buf. extend_from_slice ( & val. to_le_bytes ( ) ) ; // little-endian
43+ buf
44+ }
45+ Fingerprint :: Id ( id) => {
46+ let mut buf = Vec :: with_capacity ( CONFLUENT_MAGIC . len ( ) + size_of :: < u32 > ( ) ) ;
47+ buf. extend_from_slice ( & CONFLUENT_MAGIC ) ; // 00
48+ buf. extend_from_slice ( & id. to_be_bytes ( ) ) ; // big-endian
49+ buf
50+ }
51+ #[ cfg( feature = "md5" ) ]
52+ Fingerprint :: MD5 ( val) => {
53+ let mut buf = Vec :: with_capacity ( SINGLE_OBJECT_MAGIC . len ( ) + size_of_val ( & val) ) ;
54+ buf. extend_from_slice ( & SINGLE_OBJECT_MAGIC ) ; // C3 01
55+ buf. extend_from_slice ( & val) ;
56+ buf
57+ }
58+ #[ cfg( feature = "sha256" ) ]
59+ Fingerprint :: SHA256 ( val) => {
60+ let mut buf = Vec :: with_capacity ( SINGLE_OBJECT_MAGIC . len ( ) + size_of_val ( & val) ) ;
61+ buf. extend_from_slice ( & SINGLE_OBJECT_MAGIC ) ; // C3 01
62+ buf. extend_from_slice ( & val) ;
63+ buf
64+ }
65+ }
4366}
4467
4568fn encode_records_with_prefix (
@@ -336,6 +359,27 @@ fn new_decoder(
336359 . expect ( "failed to build decoder" )
337360}
338361
362+ fn new_decoder_id (
363+ schema_json : & ' static str ,
364+ batch_size : usize ,
365+ utf8view : bool ,
366+ id : u32 ,
367+ ) -> arrow_avro:: reader:: Decoder {
368+ let schema = AvroSchema :: new ( schema_json. parse ( ) . unwrap ( ) ) ;
369+ let mut store = arrow_avro:: schema:: SchemaStore :: new_with_type ( FingerprintAlgorithm :: None ) ;
370+ // Register the schema with a provided Confluent-style ID
371+ store
372+ . set ( Fingerprint :: Id ( id) , schema. clone ( ) )
373+ . expect ( "failed to set schema with id" ) ;
374+ ReaderBuilder :: new ( )
375+ . with_writer_schema_store ( store)
376+ . with_active_fingerprint ( Fingerprint :: Id ( id) )
377+ . with_batch_size ( batch_size)
378+ . with_utf8_view ( utf8view)
379+ . build_decoder ( )
380+ . expect ( "failed to build decoder for id" )
381+ }
382+
339383const SIZES : [ usize ; 3 ] = [ 100 , 10_000 , 1_000_000 ] ;
340384
341385const INT_SCHEMA : & str =
@@ -373,7 +417,7 @@ macro_rules! dataset {
373417 static $name: Lazy <Vec <Vec <u8 >>> = Lazy :: new( || {
374418 let schema =
375419 ApacheSchema :: parse_str( $schema_json) . expect( "invalid schema for generator" ) ;
376- let arrow_schema = AvroSchema :: new( $schema_json. to_string ( ) ) ;
420+ let arrow_schema = AvroSchema :: new( $schema_json. parse ( ) . unwrap ( ) ) ;
377421 let fingerprint = arrow_schema. fingerprint( ) . expect( "fingerprint failed" ) ;
378422 let prefix = make_prefix( fingerprint) ;
379423 SIZES
@@ -384,6 +428,24 @@ macro_rules! dataset {
384428 } ;
385429}
386430
431+ /// Additional helper for Confluent's ID-based wire format (00 + BE u32).
432+ macro_rules! dataset_id {
433+ ( $name: ident, $schema_json: expr, $gen_fn: ident, $id: expr) => {
434+ static $name: Lazy <Vec <Vec <u8 >>> = Lazy :: new( || {
435+ let schema =
436+ ApacheSchema :: parse_str( $schema_json) . expect( "invalid schema for generator" ) ;
437+ let prefix = make_prefix( Fingerprint :: Id ( $id) ) ;
438+ SIZES
439+ . iter( )
440+ . map( |& n| $gen_fn( & schema, n, & prefix) )
441+ . collect( )
442+ } ) ;
443+ } ;
444+ }
445+
446+ const ID_BENCH_ID : u32 = 7 ;
447+
448+ dataset_id ! ( INT_DATA_ID , INT_SCHEMA , gen_int, ID_BENCH_ID ) ;
387449dataset ! ( INT_DATA , INT_SCHEMA , gen_int) ;
388450dataset ! ( LONG_DATA , LONG_SCHEMA , gen_long) ;
389451dataset ! ( FLOAT_DATA , FLOAT_SCHEMA , gen_float) ;
@@ -406,19 +468,20 @@ dataset!(ENUM_DATA, ENUM_SCHEMA, gen_enum);
406468dataset ! ( MIX_DATA , MIX_SCHEMA , gen_mixed) ;
407469dataset ! ( NEST_DATA , NEST_SCHEMA , gen_nested) ;
408470
409- fn bench_scenario (
471+ fn bench_with_decoder < F > (
410472 c : & mut Criterion ,
411473 name : & str ,
412- schema_json : & ' static str ,
413474 data_sets : & [ Vec < u8 > ] ,
414- utf8view : bool ,
415- batch_size : usize ,
416- ) {
475+ rows : & [ usize ] ,
476+ mut new_decoder : F ,
477+ ) where
478+ F : FnMut ( ) -> arrow_avro:: reader:: Decoder ,
479+ {
417480 let mut group = c. benchmark_group ( name) ;
418- for ( idx, & rows ) in SIZES . iter ( ) . enumerate ( ) {
481+ for ( idx, & row_count ) in rows . iter ( ) . enumerate ( ) {
419482 let datum = & data_sets[ idx] ;
420483 group. throughput ( Throughput :: Bytes ( datum. len ( ) as u64 ) ) ;
421- match rows {
484+ match row_count {
422485 10_000 => {
423486 group
424487 . sample_size ( 25 )
@@ -433,9 +496,9 @@ fn bench_scenario(
433496 }
434497 _ => { }
435498 }
436- group. bench_function ( BenchmarkId :: from_parameter ( rows ) , |b| {
499+ group. bench_function ( BenchmarkId :: from_parameter ( row_count ) , |b| {
437500 b. iter_batched_ref (
438- || new_decoder ( schema_json , batch_size , utf8view ) ,
501+ & mut new_decoder,
439502 |decoder| {
440503 black_box ( decoder. decode ( datum) . unwrap ( ) ) ;
441504 black_box ( decoder. flush ( ) . unwrap ( ) . unwrap ( ) ) ;
@@ -449,105 +512,75 @@ fn bench_scenario(
449512
450513fn criterion_benches ( c : & mut Criterion ) {
451514 for & batch_size in & [ SMALL_BATCH , LARGE_BATCH ] {
452- bench_scenario (
453- c,
454- "Interval" ,
455- INTERVAL_SCHEMA ,
456- & INTERVAL_DATA ,
457- false ,
458- batch_size,
459- ) ;
460- bench_scenario ( c, "Int32" , INT_SCHEMA , & INT_DATA , false , batch_size) ;
461- bench_scenario ( c, "Int64" , LONG_SCHEMA , & LONG_DATA , false , batch_size) ;
462- bench_scenario ( c, "Float32" , FLOAT_SCHEMA , & FLOAT_DATA , false , batch_size) ;
463- bench_scenario ( c, "Boolean" , BOOL_SCHEMA , & BOOL_DATA , false , batch_size) ;
464- bench_scenario ( c, "Float64" , DOUBLE_SCHEMA , & DOUBLE_DATA , false , batch_size) ;
465- bench_scenario (
466- c,
467- "Binary(Bytes)" ,
468- BYTES_SCHEMA ,
469- & BYTES_DATA ,
470- false ,
471- batch_size,
472- ) ;
473- bench_scenario ( c, "String" , STRING_SCHEMA , & STRING_DATA , false , batch_size) ;
474- bench_scenario (
475- c,
476- "StringView" ,
477- STRING_SCHEMA ,
478- & STRING_DATA ,
479- true ,
480- batch_size,
481- ) ;
482- bench_scenario ( c, "Date32" , DATE_SCHEMA , & DATE_DATA , false , batch_size) ;
483- bench_scenario (
484- c,
485- "TimeMillis" ,
486- TMILLIS_SCHEMA ,
487- & TMILLIS_DATA ,
488- false ,
489- batch_size,
490- ) ;
491- bench_scenario (
492- c,
493- "TimeMicros" ,
494- TMICROS_SCHEMA ,
495- & TMICROS_DATA ,
496- false ,
497- batch_size,
498- ) ;
499- bench_scenario (
500- c,
501- "TimestampMillis" ,
502- TSMILLIS_SCHEMA ,
503- & TSMILLIS_DATA ,
504- false ,
505- batch_size,
506- ) ;
507- bench_scenario (
508- c,
509- "TimestampMicros" ,
510- TSMICROS_SCHEMA ,
511- & TSMICROS_DATA ,
512- false ,
513- batch_size,
514- ) ;
515- bench_scenario ( c, "Map" , MAP_SCHEMA , & MAP_DATA , false , batch_size) ;
516- bench_scenario ( c, "Array" , ARRAY_SCHEMA , & ARRAY_DATA , false , batch_size) ;
517- bench_scenario (
518- c,
519- "Decimal128" ,
520- DECIMAL_SCHEMA ,
521- & DECIMAL_DATA ,
522- false ,
523- batch_size,
524- ) ;
525- bench_scenario ( c, "UUID" , UUID_SCHEMA , & UUID_DATA , false , batch_size) ;
526- bench_scenario (
527- c,
528- "FixedSizeBinary" ,
529- FIXED_SCHEMA ,
530- & FIXED_DATA ,
531- false ,
532- batch_size,
533- ) ;
534- bench_scenario (
535- c,
536- "Enum(Dictionary)" ,
537- ENUM_SCHEMA ,
538- & ENUM_DATA ,
539- false ,
540- batch_size,
541- ) ;
542- bench_scenario ( c, "Mixed" , MIX_SCHEMA , & MIX_DATA , false , batch_size) ;
543- bench_scenario (
544- c,
545- "Nested(Struct)" ,
546- NEST_SCHEMA ,
547- & NEST_DATA ,
548- false ,
549- batch_size,
550- ) ;
515+ bench_with_decoder ( c, "Interval" , & INTERVAL_DATA , & SIZES , || {
516+ new_decoder ( INTERVAL_SCHEMA , batch_size, false )
517+ } ) ;
518+ bench_with_decoder ( c, "Int32" , & INT_DATA , & SIZES , || {
519+ new_decoder ( INT_SCHEMA , batch_size, false )
520+ } ) ;
521+ bench_with_decoder ( c, "Int32_Id" , & INT_DATA_ID , & SIZES , || {
522+ new_decoder_id ( INT_SCHEMA , batch_size, false , ID_BENCH_ID )
523+ } ) ;
524+ bench_with_decoder ( c, "Int64" , & LONG_DATA , & SIZES , || {
525+ new_decoder ( LONG_SCHEMA , batch_size, false )
526+ } ) ;
527+ bench_with_decoder ( c, "Float32" , & FLOAT_DATA , & SIZES , || {
528+ new_decoder ( FLOAT_SCHEMA , batch_size, false )
529+ } ) ;
530+ bench_with_decoder ( c, "Boolean" , & BOOL_DATA , & SIZES , || {
531+ new_decoder ( BOOL_SCHEMA , batch_size, false )
532+ } ) ;
533+ bench_with_decoder ( c, "Float64" , & DOUBLE_DATA , & SIZES , || {
534+ new_decoder ( DOUBLE_SCHEMA , batch_size, false )
535+ } ) ;
536+ bench_with_decoder ( c, "Binary(Bytes)" , & BYTES_DATA , & SIZES , || {
537+ new_decoder ( BYTES_SCHEMA , batch_size, false )
538+ } ) ;
539+ bench_with_decoder ( c, "String" , & STRING_DATA , & SIZES , || {
540+ new_decoder ( STRING_SCHEMA , batch_size, false )
541+ } ) ;
542+ bench_with_decoder ( c, "StringView" , & STRING_DATA , & SIZES , || {
543+ new_decoder ( STRING_SCHEMA , batch_size, true )
544+ } ) ;
545+ bench_with_decoder ( c, "Date32" , & DATE_DATA , & SIZES , || {
546+ new_decoder ( DATE_SCHEMA , batch_size, false )
547+ } ) ;
548+ bench_with_decoder ( c, "TimeMillis" , & TMILLIS_DATA , & SIZES , || {
549+ new_decoder ( TMILLIS_SCHEMA , batch_size, false )
550+ } ) ;
551+ bench_with_decoder ( c, "TimeMicros" , & TMICROS_DATA , & SIZES , || {
552+ new_decoder ( TMICROS_SCHEMA , batch_size, false )
553+ } ) ;
554+ bench_with_decoder ( c, "TimestampMillis" , & TSMILLIS_DATA , & SIZES , || {
555+ new_decoder ( TSMILLIS_SCHEMA , batch_size, false )
556+ } ) ;
557+ bench_with_decoder ( c, "TimestampMicros" , & TSMICROS_DATA , & SIZES , || {
558+ new_decoder ( TSMICROS_SCHEMA , batch_size, false )
559+ } ) ;
560+ bench_with_decoder ( c, "Map" , & MAP_DATA , & SIZES , || {
561+ new_decoder ( MAP_SCHEMA , batch_size, false )
562+ } ) ;
563+ bench_with_decoder ( c, "Array" , & ARRAY_DATA , & SIZES , || {
564+ new_decoder ( ARRAY_SCHEMA , batch_size, false )
565+ } ) ;
566+ bench_with_decoder ( c, "Decimal128" , & DECIMAL_DATA , & SIZES , || {
567+ new_decoder ( DECIMAL_SCHEMA , batch_size, false )
568+ } ) ;
569+ bench_with_decoder ( c, "UUID" , & UUID_DATA , & SIZES , || {
570+ new_decoder ( UUID_SCHEMA , batch_size, false )
571+ } ) ;
572+ bench_with_decoder ( c, "FixedSizeBinary" , & FIXED_DATA , & SIZES , || {
573+ new_decoder ( FIXED_SCHEMA , batch_size, false )
574+ } ) ;
575+ bench_with_decoder ( c, "Enum(Dictionary)" , & ENUM_DATA , & SIZES , || {
576+ new_decoder ( ENUM_SCHEMA , batch_size, false )
577+ } ) ;
578+ bench_with_decoder ( c, "Mixed" , & MIX_DATA , & SIZES , || {
579+ new_decoder ( MIX_SCHEMA , batch_size, false )
580+ } ) ;
581+ bench_with_decoder ( c, "Nested(Struct)" , & NEST_DATA , & SIZES , || {
582+ new_decoder ( NEST_SCHEMA , batch_size, false )
583+ } ) ;
551584 }
552585}
553586
0 commit comments