Skip to content

Commit 04beab3

Browse files
authored
Performance improvement for nested cardinality aggregation
When a string cardinality aggregation is nested, it ends up being applied to different buckets. Dictionary encoding relies on a different dictionary for each segment. As a result, during segment collection, we only collect term ordinals in a HashSet, and decode them in the term dictionary at the end of collection. Before this PR, this decoding phase was done once for each bucket, causing the same work to be done over and over. This PR introduces a coupon cache. The HLL sketch relies on a hash of the string values. We populate the cache before bucket collection, and get our values from it. This PR also renames "caching" to "buffering" in aggregation (it was never caching), and does several cleanups.
1 parent 3cd9011 commit 04beab3

File tree

15 files changed

+583
-222
lines changed

15 files changed

+583
-222
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ regex = { version = "1.5.5", default-features = false, features = [
2424
"std",
2525
"unicode",
2626
] }
27+
murmurhash32 = "0.3"
2728
aho-corasick = "1.0"
2829
tantivy-fst = "0.5"
2930
memmap2 = { version = "0.9.0", optional = true }
@@ -65,7 +66,7 @@ tantivy-bitpacker = { version = "0.10", path = "./bitpacker" }
6566
common = { version = "0.11", path = "./common/", package = "tantivy-common" }
6667
tokenizer-api = { version = "0.7", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
6768
sketches-ddsketch = { version = "0.4", features = ["use_serde"] }
68-
datasketches = "0.2.0"
69+
datasketches = { git = "https://github.com/fulmicoton-dd/datasketches-rust", rev = "eb4ad64" }
6970
futures-util = { version = "0.3.28", optional = true }
7071
futures-channel = { version = "0.3.28", optional = true }
7172
fnv = "1.0.7"

benches/agg_bench.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ fn bench_agg(mut group: InputGroup<Index>) {
7878

7979
register!(group, cardinality_agg);
8080
register!(group, terms_status_with_cardinality_agg);
81+
register!(group, terms_100_buckets_with_cardinality_agg);
8182

8283
register!(group, range_agg);
8384
register!(group, range_agg_with_avg_sub_agg);
@@ -169,6 +170,22 @@ fn terms_status_with_cardinality_agg(index: &Index) {
169170
let agg_req = json!({
170171
"my_texts": {
171172
"terms": { "field": "text_few_terms_status" },
173+
"aggs": {
174+
"cardinality": {
175+
"cardinality": {
176+
"field": "text_few_terms_status"
177+
},
178+
}
179+
}
180+
},
181+
});
182+
execute_agg(index, agg_req);
183+
}
184+
185+
fn terms_100_buckets_with_cardinality_agg(index: &Index) {
186+
let agg_req = json!({
187+
"my_texts": {
188+
"terms": { "field": "text_1000_terms_zipf", "size": 100 },
172189
"aggs": {
173190
"cardinality": {
174191
"cardinality": {

columnar/src/block_accessor.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
3333
&mut self,
3434
docs: &[u32],
3535
accessor: &Column<T>,
36-
missing: Option<T>,
36+
missing_opt: Option<T>,
3737
) {
3838
self.fetch_block(docs, accessor);
3939
// no missing values
4040
if accessor.index.get_cardinality().is_full() {
4141
return;
4242
}
43-
let Some(missing) = missing else {
43+
let Some(missing) = missing_opt else {
4444
return;
4545
};
4646

@@ -191,6 +191,7 @@ where F: FnMut(u32) {
191191
}
192192

193193
#[cfg(test)]
194+
#[allow(clippy::field_reassign_with_default)]
194195
mod tests {
195196
use super::*;
196197

src/aggregation/bucket/composite/collector.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use crate::aggregation::bucket::composite::map::{DynArrayHeapMap, MAX_DYN_ARRAY_
2121
use crate::aggregation::bucket::{
2222
CalendarInterval, CompositeAggregationSource, MissingOrder, Order,
2323
};
24-
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardSubAggCache};
24+
use crate::aggregation::buffered_sub_aggs::{BufferedSubAggs, HighCardSubAggBuffer};
2525
use crate::aggregation::intermediate_agg_result::{
2626
CompositeIntermediateKey, IntermediateAggregationResult, IntermediateAggregationResults,
2727
IntermediateBucketResult, IntermediateCompositeBucketEntry, IntermediateCompositeBucketResult,
@@ -119,7 +119,7 @@ pub struct SegmentCompositeCollector {
119119
/// One DynArrayHeapMap per parent bucket.
120120
parent_buckets: Vec<DynArrayHeapMap<InternalValueRepr, CompositeBucketCollector>>,
121121
accessor_idx: usize,
122-
sub_agg: Option<CachedSubAggs<HighCardSubAggCache>>,
122+
sub_agg: Option<BufferedSubAggs<HighCardSubAggBuffer>>,
123123
bucket_id_provider: BucketIdProvider,
124124
/// Number of sources, needed when creating new DynArrayHeapMaps.
125125
num_sources: usize,
@@ -218,7 +218,7 @@ impl SegmentCompositeCollector {
218218
let has_sub_aggregations = !node.children.is_empty();
219219
let sub_agg = if has_sub_aggregations {
220220
let sub_agg_collector = build_segment_agg_collectors(req_data, &node.children)?;
221-
Some(CachedSubAggs::new(sub_agg_collector))
221+
Some(BufferedSubAggs::new(sub_agg_collector))
222222
} else {
223223
None
224224
};
@@ -332,7 +332,7 @@ fn collect_bucket_with_limit(
332332
limit_num_buckets: usize,
333333
buckets: &mut DynArrayHeapMap<InternalValueRepr, CompositeBucketCollector>,
334334
key: &[InternalValueRepr],
335-
sub_agg: &mut Option<CachedSubAggs<HighCardSubAggCache>>,
335+
sub_agg: &mut Option<BufferedSubAggs<HighCardSubAggBuffer>>,
336336
bucket_id_provider: &mut BucketIdProvider,
337337
) {
338338
let mut record_in_bucket = |bucket: &mut CompositeBucketCollector| {
@@ -488,7 +488,7 @@ struct CompositeKeyVisitor<'a> {
488488
doc_id: crate::DocId,
489489
composite_agg_data: &'a CompositeAggReqData,
490490
buckets: &'a mut DynArrayHeapMap<InternalValueRepr, CompositeBucketCollector>,
491-
sub_agg: &'a mut Option<CachedSubAggs<HighCardSubAggCache>>,
491+
sub_agg: &'a mut Option<BufferedSubAggs<HighCardSubAggBuffer>>,
492492
bucket_id_provider: &'a mut BucketIdProvider,
493493
sub_level_values: SmallVec<[InternalValueRepr; MAX_DYN_ARRAY_SIZE]>,
494494
}

src/aggregation/bucket/composite/mod.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -511,14 +511,14 @@ mod tests {
511511

512512
fn datetime_from_iso_str(date_str: &str) -> common::DateTime {
513513
let dt = OffsetDateTime::parse(date_str, &Rfc3339)
514-
.expect(&format!("Failed to parse date: {}", date_str));
514+
.unwrap_or_else(|_| panic!("Failed to parse date: {}", date_str));
515515
let timestamp_secs = dt.unix_timestamp_nanos();
516516
common::DateTime::from_timestamp_nanos(timestamp_secs as i64)
517517
}
518518

519519
fn ms_timestamp_from_iso_str(date_str: &str) -> i64 {
520520
let dt = OffsetDateTime::parse(date_str, &Rfc3339)
521-
.expect(&format!("Failed to parse date: {}", date_str));
521+
.unwrap_or_else(|_| panic!("Failed to parse date: {}", date_str));
522522
(dt.unix_timestamp_nanos() / 1_000_000) as i64
523523
}
524524

@@ -548,7 +548,7 @@ mod tests {
548548
agg_req_json["my_composite"]["composite"]["after"] = after_key.take().unwrap();
549549
}
550550
let agg_req: Aggregations = serde_json::from_value(agg_req_json).unwrap();
551-
let res = exec_request(agg_req.clone(), &index).unwrap();
551+
let res = exec_request(agg_req.clone(), index).unwrap();
552552
let expected_page_buckets = &expected_buckets_vec[page_idx * page_size
553553
..std::cmp::min((page_idx + 1) * page_size, expected_buckets_vec.len())];
554554
assert_eq!(
@@ -578,7 +578,7 @@ mod tests {
578578
}
579579
});
580580
let agg_req: Aggregations = serde_json::from_value(agg_req_json).unwrap();
581-
let res = exec_request(agg_req.clone(), &index).unwrap();
581+
let res = exec_request(agg_req.clone(), index).unwrap();
582582
assert_eq!(
583583
res["my_composite"]["buckets"],
584584
json!([]),

src/aggregation/bucket/filter.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
66
use crate::aggregation::agg_data::{
77
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
88
};
9-
use crate::aggregation::cached_sub_aggs::{
10-
CachedSubAggs, HighCardSubAggCache, LowCardSubAggCache, SubAggCache,
9+
use crate::aggregation::buffered_sub_aggs::{
10+
BufferedSubAggs, HighCardSubAggBuffer, LowCardSubAggBuffer, SubAggBuffer,
1111
};
1212
use crate::aggregation::intermediate_agg_result::{
1313
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
@@ -503,17 +503,17 @@ struct DocCount {
503503
}
504504

505505
/// Segment collector for filter aggregation
506-
pub struct SegmentFilterCollector<C: SubAggCache> {
506+
pub struct SegmentFilterCollector<B: SubAggBuffer> {
507507
/// Document counts per parent bucket
508508
parent_buckets: Vec<DocCount>,
509509
/// Sub-aggregation collectors
510-
sub_aggregations: Option<CachedSubAggs<C>>,
510+
sub_aggregations: Option<BufferedSubAggs<B>>,
511511
bucket_id_provider: BucketIdProvider,
512512
/// Accessor index for this filter aggregation (to access FilterAggReqData)
513513
accessor_idx: usize,
514514
}
515515

516-
impl<C: SubAggCache> SegmentFilterCollector<C> {
516+
impl<B: SubAggBuffer> SegmentFilterCollector<B> {
517517
/// Create a new filter segment collector following the new agg_data pattern
518518
pub(crate) fn from_req_and_validate(
519519
req: &mut AggregationsSegmentCtx,
@@ -525,7 +525,7 @@ impl<C: SubAggCache> SegmentFilterCollector<C> {
525525
} else {
526526
None
527527
};
528-
let sub_agg_collector = sub_agg_collector.map(CachedSubAggs::new);
528+
let sub_agg_collector = sub_agg_collector.map(BufferedSubAggs::new);
529529

530530
Ok(SegmentFilterCollector {
531531
parent_buckets: Vec::new(),
@@ -547,16 +547,16 @@ pub(crate) fn build_segment_filter_collector(
547547

548548
if is_top_level {
549549
Ok(Box::new(
550-
SegmentFilterCollector::<LowCardSubAggCache>::from_req_and_validate(req, node)?,
550+
SegmentFilterCollector::<LowCardSubAggBuffer>::from_req_and_validate(req, node)?,
551551
))
552552
} else {
553553
Ok(Box::new(
554-
SegmentFilterCollector::<HighCardSubAggCache>::from_req_and_validate(req, node)?,
554+
SegmentFilterCollector::<HighCardSubAggBuffer>::from_req_and_validate(req, node)?,
555555
))
556556
}
557557
}
558558

559-
impl<C: SubAggCache> Debug for SegmentFilterCollector<C> {
559+
impl<B: SubAggBuffer> Debug for SegmentFilterCollector<B> {
560560
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
561561
f.debug_struct("SegmentFilterCollector")
562562
.field("buckets", &self.parent_buckets)
@@ -566,7 +566,7 @@ impl<C: SubAggCache> Debug for SegmentFilterCollector<C> {
566566
}
567567
}
568568

569-
impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
569+
impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentFilterCollector<B> {
570570
fn add_intermediate_aggregation_result(
571571
&mut self,
572572
agg_data: &AggregationsSegmentCtx,

src/aggregation/bucket/histogram/histogram.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use crate::aggregation::agg_data::{
1010
};
1111
use crate::aggregation::agg_req::Aggregations;
1212
use crate::aggregation::agg_result::BucketEntry;
13-
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardCachedSubAggs};
13+
use crate::aggregation::buffered_sub_aggs::{BufferedSubAggs, HighCardBufferedSubAggs};
1414
use crate::aggregation::intermediate_agg_result::{
1515
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
1616
IntermediateHistogramBucketEntry,
@@ -258,7 +258,7 @@ pub(crate) struct SegmentHistogramBucketEntry {
258258
impl SegmentHistogramBucketEntry {
259259
pub(crate) fn into_intermediate_bucket_entry(
260260
self,
261-
sub_aggregation: &mut Option<HighCardCachedSubAggs>,
261+
sub_aggregation: &mut Option<HighCardBufferedSubAggs>,
262262
agg_data: &AggregationsSegmentCtx,
263263
) -> crate::Result<IntermediateHistogramBucketEntry> {
264264
let mut sub_aggregation_res = IntermediateAggregationResults::default();
@@ -291,7 +291,7 @@ pub struct SegmentHistogramCollector {
291291
/// The buckets containing the aggregation data.
292292
/// One Histogram bucket per parent bucket id.
293293
parent_buckets: Vec<HistogramBuckets>,
294-
sub_agg: Option<HighCardCachedSubAggs>,
294+
sub_agg: Option<HighCardBufferedSubAggs>,
295295
accessor_idx: usize,
296296
bucket_id_provider: BucketIdProvider,
297297
}
@@ -444,7 +444,7 @@ impl SegmentHistogramCollector {
444444
max: f64::MAX,
445445
});
446446
req_data.offset = req_data.req.offset.unwrap_or(0.0);
447-
let sub_agg = sub_agg.map(CachedSubAggs::new);
447+
let sub_agg = sub_agg.map(BufferedSubAggs::new);
448448

449449
Ok(Self {
450450
parent_buckets: Default::default(),

src/aggregation/bucket/range.rs

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@ use crate::aggregation::agg_data::{
99
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
1010
};
1111
use crate::aggregation::agg_limits::AggregationLimitsGuard;
12-
use crate::aggregation::cached_sub_aggs::{
13-
CachedSubAggs, HighCardSubAggCache, LowCardCachedSubAggs, LowCardSubAggCache, SubAggCache,
12+
use crate::aggregation::buffered_sub_aggs::{
13+
BufferedSubAggs, HighCardSubAggBuffer, LowCardBufferedSubAggs, LowCardSubAggBuffer,
14+
SubAggBuffer,
1415
};
1516
use crate::aggregation::intermediate_agg_result::{
1617
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
@@ -155,13 +156,13 @@ pub(crate) struct SegmentRangeAndBucketEntry {
155156

156157
/// The collector puts values from the fast field into the correct buckets and does a conversion to
157158
/// the correct datatype.
158-
pub struct SegmentRangeCollector<C: SubAggCache> {
159+
pub struct SegmentRangeCollector<B: SubAggBuffer> {
159160
/// The buckets containing the aggregation data.
160161
/// One for each ParentBucketId
161162
parent_buckets: Vec<Vec<SegmentRangeAndBucketEntry>>,
162163
column_type: ColumnType,
163164
pub(crate) accessor_idx: usize,
164-
sub_agg: Option<CachedSubAggs<C>>,
165+
sub_agg: Option<BufferedSubAggs<B>>,
165166
/// Here things get a bit weird. We need to assign unique bucket ids across all
166167
/// parent buckets. So we keep track of the next available bucket id here.
167168
/// This allows a kind of flattening of the bucket ids across all parent buckets.
@@ -178,7 +179,7 @@ pub struct SegmentRangeCollector<C: SubAggCache> {
178179
limits: AggregationLimitsGuard,
179180
}
180181

181-
impl<C: SubAggCache> Debug for SegmentRangeCollector<C> {
182+
impl<B: SubAggBuffer> Debug for SegmentRangeCollector<B> {
182183
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
183184
f.debug_struct("SegmentRangeCollector")
184185
.field("parent_buckets_len", &self.parent_buckets.len())
@@ -229,7 +230,7 @@ impl SegmentRangeBucketEntry {
229230
}
230231
}
231232

232-
impl<C: SubAggCache> SegmentAggregationCollector for SegmentRangeCollector<C> {
233+
impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
233234
fn add_intermediate_aggregation_result(
234235
&mut self,
235236
agg_data: &AggregationsSegmentCtx,
@@ -350,17 +351,17 @@ pub(crate) fn build_segment_range_collector(
350351
};
351352

352353
if is_low_card {
353-
Ok(Box::new(SegmentRangeCollector::<LowCardSubAggCache> {
354-
sub_agg: sub_agg.map(LowCardCachedSubAggs::new),
354+
Ok(Box::new(SegmentRangeCollector::<LowCardSubAggBuffer> {
355+
sub_agg: sub_agg.map(LowCardBufferedSubAggs::new),
355356
column_type: field_type,
356357
accessor_idx,
357358
parent_buckets: Vec::new(),
358359
bucket_id_provider: BucketIdProvider::default(),
359360
limits: agg_data.context.limits.clone(),
360361
}))
361362
} else {
362-
Ok(Box::new(SegmentRangeCollector::<HighCardSubAggCache> {
363-
sub_agg: sub_agg.map(CachedSubAggs::new),
363+
Ok(Box::new(SegmentRangeCollector::<HighCardSubAggBuffer> {
364+
sub_agg: sub_agg.map(BufferedSubAggs::new),
364365
column_type: field_type,
365366
accessor_idx,
366367
parent_buckets: Vec::new(),
@@ -370,7 +371,7 @@ pub(crate) fn build_segment_range_collector(
370371
}
371372
}
372373

373-
impl<C: SubAggCache> SegmentRangeCollector<C> {
374+
impl<B: SubAggBuffer> SegmentRangeCollector<B> {
374375
pub(crate) fn create_new_buckets(
375376
&mut self,
376377
agg_data: &AggregationsSegmentCtx,
@@ -554,7 +555,7 @@ mod tests {
554555
pub fn get_collector_from_ranges(
555556
ranges: Vec<RangeAggregationRange>,
556557
field_type: ColumnType,
557-
) -> SegmentRangeCollector<HighCardSubAggCache> {
558+
) -> SegmentRangeCollector<HighCardSubAggBuffer> {
558559
let req = RangeAggregation {
559560
field: "dummy".to_string(),
560561
ranges,

0 commit comments

Comments
 (0)