1515// specific language governing permissions and limitations
1616// under the License.
1717
18+ use std:: collections:: HashMap ;
1819use std:: fmt:: Debug ;
19- use std:: sync:: Arc ;
20+ use std:: sync:: { Arc , RwLock } ;
2021
2122use arrow:: compute:: SortOptions ;
2223use arrow:: datatypes:: { IntervalMonthDayNanoType , Schema , SchemaRef } ;
@@ -135,7 +136,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
135136 self . try_into_physical_plan_with_converter (
136137 ctx,
137138 codec,
138- & DefaultPhysicalProtoConverter { } ,
139+ & DefaultPhysicalProtoConverter :: new ( ) ,
139140 )
140141 }
141142
@@ -149,7 +150,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
149150 Self :: try_from_physical_plan_with_converter (
150151 plan,
151152 codec,
152- & DefaultPhysicalProtoConverter { } ,
153+ & DefaultPhysicalProtoConverter :: new ( ) ,
153154 )
154155 }
155156}
@@ -2991,6 +2992,7 @@ impl protobuf::PhysicalPlanNode {
29912992 nulls_first : expr. options . nulls_first ,
29922993 } ) ;
29932994 Ok ( protobuf:: PhysicalExprNode {
2995+ expr_arc_id : None ,
29942996 expr_type : Some ( ExprType :: Sort ( sort_expr) ) ,
29952997 } )
29962998 } )
@@ -3076,6 +3078,7 @@ impl protobuf::PhysicalPlanNode {
30763078 nulls_first : expr. options . nulls_first ,
30773079 } ) ;
30783080 Ok ( protobuf:: PhysicalExprNode {
3081+ expr_arc_id : None ,
30793082 expr_type : Some ( ExprType :: Sort ( sort_expr) ) ,
30803083 } )
30813084 } )
@@ -3661,7 +3664,44 @@ struct DataEncoderTuple {
36613664 pub blob : Vec < u8 > ,
36623665}
36633666
3664- pub struct DefaultPhysicalProtoConverter ;
3667+ /// Default implementation of [`PhysicalProtoConverterExtension`] that provides
3668+ /// expression deduplication during deserialization.
3669+ ///
3670+ /// During serialization, the Arc pointer address of each expression is embedded
3671+ /// in the protobuf as `expr_arc_id`. During deserialization, if an expression
3672+ /// with the same `expr_arc_id` has been seen before, the cached Arc is returned
3673+ /// instead of creating a new one. This enables expression sharing and can
3674+ /// significantly reduce memory usage for plans with duplicate expressions
3675+ /// (e.g., large IN lists).
3676+ ///
3677+ /// # Important: Scope of Deduplication
3678+ ///
3679+ /// The `expr_arc_id` is only valid as a deduplication key **within a single
3680+ /// serialized plan from a single process**. Arc pointer addresses can collide:
3681+ /// - Different processes may allocate Arcs at the same address
3682+ /// - The same process may reuse addresses after deallocation
3683+ ///
3684+ /// Therefore, you **must create a fresh `DefaultPhysicalProtoConverter` instance
3685+ /// for each plan you deserialize**. Do not reuse the same converter instance
3686+ /// across multiple plans from different sources, as this could incorrectly
3687+ /// deduplicate unrelated expressions that happen to share the same pointer address.
3688+ #[ derive( Default ) ]
3689+ pub struct DefaultPhysicalProtoConverter {
3690+ /// Cache for expression deduplication during deserialization.
3691+ /// Maps expr_arc_id (the original Arc pointer address) to the deserialized expression.
3692+ ///
3693+ /// This cache should only be used for a single plan deserialization.
3694+ /// Create a new converter instance for each plan to avoid cross-plan collisions.
3695+ dedup_cache : RwLock < HashMap < u64 , Arc < dyn PhysicalExpr > > > ,
3696+ }
3697+
3698+ impl DefaultPhysicalProtoConverter {
3699+ /// Creates a new `DefaultPhysicalProtoConverter` with an empty dedup cache.
3700+ pub fn new ( ) -> Self {
3701+ Self :: default ( )
3702+ }
3703+ }
3704+
36653705impl PhysicalProtoConverterExtension for DefaultPhysicalProtoConverter {
36663706 fn proto_to_execution_plan (
36673707 & self ,
@@ -3697,16 +3737,48 @@ impl PhysicalProtoConverterExtension for DefaultPhysicalProtoConverter {
36973737 where
36983738 Self : Sized ,
36993739 {
3700- // Default implementation calls the free function
3701- parse_physical_expr_with_converter ( proto, ctx, input_schema, codec, self )
3740+ // Check if we've seen this expr_arc_id before (deduplication)
3741+ if let Some ( arc_id) = proto. expr_arc_id {
3742+ // Try to get from cache first
3743+ {
3744+ let cache = self . dedup_cache . read ( ) . unwrap ( ) ;
3745+ if let Some ( cached) = cache. get ( & arc_id) {
3746+ return Ok ( Arc :: clone ( cached) ) ;
3747+ }
3748+ }
3749+
3750+ // Not in cache, deserialize the expression
3751+ let expr = parse_physical_expr_with_converter (
3752+ proto,
3753+ ctx,
3754+ input_schema,
3755+ codec,
3756+ self ,
3757+ ) ?;
3758+
3759+ // Cache it for future lookups
3760+ {
3761+ let mut cache = self . dedup_cache . write ( ) . unwrap ( ) ;
3762+ cache. insert ( arc_id, Arc :: clone ( & expr) ) ;
3763+ }
3764+
3765+ Ok ( expr)
3766+ } else {
3767+ // No arc_id, just deserialize normally (backward compatibility)
3768+ parse_physical_expr_with_converter ( proto, ctx, input_schema, codec, self )
3769+ }
37023770 }
37033771
37043772 fn physical_expr_to_proto (
37053773 & self ,
37063774 expr : & Arc < dyn PhysicalExpr > ,
37073775 codec : & dyn PhysicalExtensionCodec ,
37083776 ) -> Result < protobuf:: PhysicalExprNode > {
3709- serialize_physical_expr_with_converter ( expr, codec, self )
3777+ let mut proto = serialize_physical_expr_with_converter ( expr, codec, self ) ?;
3778+ // Set the expr_arc_id to the Arc pointer address for deduplication
3779+ // Cast through a thin pointer to get a unique identifier for this Arc
3780+ proto. expr_arc_id = Some ( Arc :: as_ptr ( expr) as * const ( ) as u64 ) ;
3781+ Ok ( proto)
37103782 }
37113783}
37123784
0 commit comments