Prune GPU sketch after each merge (#12105)

RAMitchell · web-flow · commit 4b81e2fb72fd · 2026-03-22T20:55:20.000+01:00
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
@@ -46,6 +46,7 @@ size_t RequiredSampleCuts(bst_idx_t num_rows, bst_feature_t num_columns, size_t
 size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz, size_t num_bins,
                       bool with_weights) {
   size_t peak = 0;
+  auto cuts_bytes = RequiredSampleCuts(num_rows, num_bins, num_bins, nnz) * sizeof(SketchEntry);
   // 0. Allocate cut pointer in quantile container by increasing: n_columns + 1
   size_t total = (num_columns + 1) * sizeof(SketchContainer::OffsetT);
   // 1. Copy and sort: 2 * bytes_per_element * shape
@@ -58,16 +59,22 @@ size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz,
   // 4. Allocate cut pointer by increasing: n_columns + 1
   total += (num_columns + 1) * sizeof(SketchContainer::OffsetT);
   // 5. Allocate cuts: assuming rows is greater than bins: n_columns * limit_size
-  total += RequiredSampleCuts(num_rows, num_bins, num_bins, nnz) * sizeof(SketchEntry);
-  // 6. Deallocate copied entries by reducing: bytes_per_element * shape.
+  total += cuts_bytes;
+  // 6. Install the first batch summary into the resident sketch while the temporary pruned
+  // summary is still live.
+  total += cuts_bytes;
+  // 7. Deallocate copied entries by reducing: bytes_per_element * shape.
   peak = std::max(peak, total);
   total -= (BytesPerElement(with_weights) * num_rows * num_columns) / 2;
-  // 7. Deallocate column size scan.
+  // 8. Deallocate the temporary pruned batch summary after merge/prune commit.
+  peak = std::max(peak, total);
+  total -= cuts_bytes;
+  // 9. Deallocate column size scan.
   peak = std::max(peak, total);
   total -= (num_columns + 1) * sizeof(SketchContainer::OffsetT);
-  // 8. Deallocate cut size scan.
+  // 10. Deallocate cut size scan.
   total -= (num_columns + 1) * sizeof(SketchContainer::OffsetT);
-  // 9. Allocate final cut values and cut ptrs: std::min(rows, bins + 1) * n_columns +
+  // 11. Allocate final cut values and cut ptrs: std::min(rows, bins + 1) * n_columns +
   //    n_columns + 1
   total += std::min(num_rows, num_bins) * num_columns * sizeof(float);
   total +=
@@ -269,8 +276,9 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
   CHECK_EQ(d_cuts_ptr.size(), column_sizes_scan.size());
 
   // Add cuts into sketches
+  auto approx_n_samples = std::max<bst_idx_t>(1, (end - begin + info.num_col_ - 1) / info.num_col_);
   sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr,
-                         h_cuts_ptr.back(), dh::ToSpan(entry_weight));
+                         h_cuts_ptr.back(), approx_n_samples, dh::ToSpan(entry_weight));
 
   sorted_entries.clear();
   sorted_entries.shrink_to_fit();
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
@@ -278,7 +278,8 @@ inline HistogramCuts DeviceSketch(
 template <typename AdapterBatch>
 void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInfo const& info,
                           size_t n_features, size_t begin, size_t end, float missing,
-                          SketchContainer* sketch_container, int num_cuts) {
+                          SketchContainer* sketch_container, int num_cuts,
+                          bst_idx_t approx_n_samples) {
   // Copy current subset of valid elements into temporary storage and sort
   dh::device_vector<Entry> sorted_entries;
   dh::caching_device_vector<size_t> column_sizes_scan;
@@ -303,7 +304,7 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf
   auto const& h_cuts_ptr = cuts_ptr.HostVector();
   // Extract the cuts from all columns concurrently
   sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr,
-                         h_cuts_ptr.back());
+                         h_cuts_ptr.back(), approx_n_samples);
 
   sorted_entries.clear();
   sorted_entries.shrink_to_fit();
@@ -313,7 +314,7 @@ template <typename Batch>
 void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo const& info,
                                   int num_cuts_per_feature, bool is_ranking, float missing,
                                   size_t columns, size_t begin, size_t end,
-                                  SketchContainer* sketch_container) {
+                                  SketchContainer* sketch_container, bst_idx_t approx_n_samples) {
   curt::SetDevice(ctx->Ordinal());
   info.weights_.SetDevice(ctx->Device());
   auto weights = info.weights_.ConstDeviceSpan();
@@ -379,7 +380,7 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons
 
   // Extract cuts
   sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr,
-                         h_cuts_ptr.back(), dh::ToSpan(temp_weights));
+                         h_cuts_ptr.back(), approx_n_samples, dh::ToSpan(temp_weights));
   sorted_entries.clear();
   sorted_entries.shrink_to_fit();
 }
@@ -431,10 +432,10 @@ void AdapterDeviceSketch(Context const* ctx, Batch batch, bst_bin_t num_bins, Me
     if (weighted) {
       ProcessWeightedSlidingWindow(ctx, batch, info, num_cuts_per_feature,
                                    HostSketchContainer::UseGroup(info), missing, num_cols, begin,
-                                   end, sketch_container);
+                                   end, sketch_container, approx_n_samples);
     } else {
       ProcessSlidingWindow(ctx, batch, info, num_cols, begin, end, missing, sketch_container,
-                           num_cuts_per_feature);
+                           num_cuts_per_feature, approx_n_samples);
     }
     begin += sketch_batch_num_elements;
   }
diff --git a/src/common/quantile.cu b/src/common/quantile.cu
@@ -301,21 +301,17 @@ void MergeImpl(Context const *ctx, Span<SketchEntry const> const &d_x,
   });
 }
 
+// Convert one sorted batch into a temporary pruned summary in `prune_buffer_`, normalize
+// duplicated raw values in place, then merge that summary into the resident sketch in
+// `entries_`. Out-of-place merge/prune results use `entries_tmp_` as scratch before being
+// committed back into `entries_`.
 void SketchContainer::Push(Context const *ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
-                           common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights) {
+                           common::Span<OffsetT> cuts_ptr, size_t total_cuts,
+                           bst_idx_t n_rows_in_batch, Span<float> weights) {
   curt::SetDevice(ctx->Ordinal());
-  auto &current = this->entries_;
-  auto &columns_ptr_out = this->columns_ptr_;
-  Span<SketchEntry> out;
-  dh::device_vector<SketchEntry> cuts;
-  bool first_window = current.empty();
-  if (!first_window) {
-    cuts.resize(total_cuts);
-    out = dh::ToSpan(cuts);
-  } else {
-    current.resize(total_cuts);
-    out = dh::ToSpan(current);
-  }
+  rows_seen_ += n_rows_in_batch;
+  this->prune_buffer_.resize(total_cuts);
+  auto out = dh::ToSpan(this->prune_buffer_);
   auto ft = this->feature_types_.ConstDeviceSpan();
   if (weights.empty()) {
     auto to_sketch_entry = [] __device__(size_t sample_idx, Span<Entry const> const &column,
@@ -340,19 +336,13 @@ void SketchContainer::Push(Context const *ctx, Span<Entry const> entries, Span<s
     PruneImpl<Entry>(cuts_ptr, entries, columns_ptr, ft, out, to_sketch_entry);
   }
   auto n_uniques = this->ScanInput(ctx, out, cuts_ptr);
-
-  if (!first_window) {
-    CHECK_EQ(columns_ptr_out.Size(), cuts_ptr.size());
-    out = out.subspan(0, n_uniques);
-    this->Merge(ctx, cuts_ptr, out);
-  } else {
-    current.resize(n_uniques);
-    columns_ptr_out.SetDevice(ctx->Device());
-    columns_ptr_out.Resize(cuts_ptr.size());
-
-    auto d_cuts_ptr = columns_ptr_out.DeviceSpan();
-    CopyTo(d_cuts_ptr, cuts_ptr);
+  CHECK_EQ(this->columns_ptr_.Size(), cuts_ptr.size());
+  if (n_uniques == 0) {
+    return;
   }
+  this->Merge(ctx, cuts_ptr, out.subspan(0, n_uniques));
+  auto intermediate_num_cuts = static_cast<bst_idx_t>(this->IntermediateNumCuts());
+  this->Prune(ctx, intermediate_num_cuts);
 }
 
 size_t SketchContainer::ScanInput(Context const *ctx, Span<SketchEntry> entries,
@@ -404,6 +394,11 @@ void SketchContainer::Prune(Context const *ctx, std::size_t to) {
   auto &columns_ptr_tmp = this->columns_ptr_tmp_;
   auto const &feature_types = this->feature_types_;
 
+  if (entries.size() <= to * num_columns_) {
+    timer_.Stop(__func__);
+    return;
+  }
+
   OffsetT to_total = 0;
   auto &h_columns_ptr = columns_ptr_tmp.HostVector();
   h_columns_ptr[0] = to_total;
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
@@ -6,6 +6,7 @@
 
 #include <thrust/logical.h>  // for any_of
 
+#include <algorithm>
 #include <cstddef>     // for size_t
 #include <functional>  // for equal_to
 
@@ -52,17 +53,25 @@ class SketchContainer {
   // The container is just a CSC matrix plus scratch storage for out-of-place transforms.
   dh::device_vector<SketchEntry> entries_;
   dh::device_vector<SketchEntry> entries_tmp_;
+  dh::device_vector<SketchEntry> prune_buffer_;
   HostDeviceVector<OffsetT> columns_ptr_;
   HostDeviceVector<OffsetT> columns_ptr_tmp_;
 
   bool has_categorical_{false};
+  std::size_t rows_seen_{0};
 
   void SetCurrentColumns(Span<OffsetT const> columns_ptr);
   void CommitScratch(std::size_t n_entries) {
     entries_.swap(entries_tmp_);
     columns_ptr_.Copy(columns_ptr_tmp_);
     entries_.resize(n_entries);
   }
+  [[nodiscard]] std::size_t IntermediateNumCuts() const {  // Per-feature budget for Prune().
+    auto const base = static_cast<std::size_t>(num_bins_) * kFactor;
+    auto const eps = 1.0 / static_cast<double>(base);
+    auto const per_feature = WQSketch::LimitSizeLevel(std::max<std::size_t>(1, rows_seen_), eps);
+    return per_feature;  // Prune() scales `to` by num_columns_ itself; do not pre-multiply.
+  }
 
   // Get the span of one column.
   Span<SketchEntry> Column(bst_feature_t i) {
@@ -109,15 +118,18 @@ class SketchContainer {
    */
   [[nodiscard]] std::size_t MemCapacityBytes() const {
     auto constexpr kE = sizeof(typename decltype(this->entries_)::value_type);
-    auto n_bytes = (this->entries_.capacity() + this->entries_tmp_.capacity()) * kE;
+    auto n_bytes =
+        (this->entries_.capacity() + this->entries_tmp_.capacity() + this->prune_buffer_.capacity()) *
+        kE;
     n_bytes += (this->columns_ptr_.Size() + this->columns_ptr_tmp_.Size()) * sizeof(OffsetT);
     n_bytes += this->feature_types_.Size() * sizeof(FeatureType);
 
     return n_bytes;
   }
   [[nodiscard]] std::size_t MemCostBytes() const {
     auto constexpr kE = sizeof(typename decltype(this->entries_)::value_type);
-    auto n_bytes = (this->entries_.size() + this->entries_tmp_.size()) * kE;
+    auto n_bytes =
+        (this->entries_.size() + this->entries_tmp_.size() + this->prune_buffer_.size()) * kE;
     n_bytes += (this->columns_ptr_.Size() + this->columns_ptr_tmp_.Size()) * sizeof(OffsetT);
     n_bytes += this->feature_types_.Size() * sizeof(FeatureType);
 
@@ -140,7 +152,8 @@ class SketchContainer {
    * \param weights (optional) data weights.
    */
   void Push(Context const* ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
-            common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights = {});
+            common::Span<OffsetT> cuts_ptr, size_t total_cuts, bst_idx_t n_rows_in_batch,
+            Span<float> weights = {});
   /**
    * @brief Prune the quantile structure.
    *
diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h
@@ -111,7 +111,11 @@ inline void TestRank(const std::vector<float>& column_cuts, const std::vector<fl
       j++;
     }
     double expected_rank = ((i + 1) * total_weight) / column_cuts.size();
-    double acceptable_error = std::max(2.9, total_weight * eps);
+    // For small sketches, a purely relative tolerance can be tighter than one bin's
+    // expected mass. Use the larger of the relative tolerance and the average per-cut
+    // mass instead of a hard-coded floor.
+    double acceptable_error =
+        std::max(total_weight * eps, total_weight / static_cast<double>(column_cuts.size()));
     EXPECT_LE(std::abs(expected_rank - sum_weight), acceptable_error);
   }
 }
diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu
@@ -45,7 +45,7 @@ TEST(GPUQuantile, Basic) {
   dh::device_vector<bst_idx_t> cuts_ptr(kCols + 1);
   thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
   // Push empty
-  sketch.Push(&ctx, dh::ToSpan(entries), dh::ToSpan(cuts_ptr), dh::ToSpan(cuts_ptr), 0);
+  sketch.Push(&ctx, dh::ToSpan(entries), dh::ToSpan(cuts_ptr), dh::ToSpan(cuts_ptr), 0, 0);
   ASSERT_EQ(sketch.Data().size(), 0);
 }
 
@@ -332,9 +332,9 @@ TEST(GPUQuantile, MergeCategorical) {
   dh::device_vector<size_t> cuts_ptr_1{0, 5, 8};
 
   sketch_0.Push(&ctx, dh::ToSpan(d_entries_0), dh::ToSpan(columns_ptr_0), dh::ToSpan(cuts_ptr_0),
-                entries_0.size(), {});
+                entries_0.size(), 5, {});
   sketch_1.Push(&ctx, dh::ToSpan(d_entries_1), dh::ToSpan(columns_ptr_1), dh::ToSpan(cuts_ptr_1),
-                entries_1.size(), {});
+                entries_1.size(), 5, {});
 
   sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data());
   TestQuantileElemRank(ctx.Device(), sketch_0.Data(), sketch_0.ColumnsPtr());
@@ -639,7 +639,7 @@ TEST(GPUQuantile, Push) {
   HostDeviceVector<FeatureType> ft;
   SketchContainer sketch(ft, n_bins, kCols, ctx.Device());
   sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows,
-              {});
+              kRows, {});
 
   auto sketch_data = sketch.Data();
 
@@ -690,7 +690,7 @@ TEST(GPUQuantile, MultiColPush) {
   dh::device_vector<size_t> cuts_ptr(columns_ptr);
 
   sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(cuts_ptr),
-              kRows * kCols, {});
+              kRows * kCols, kRows, {});
 
   auto sketch_data = sketch.Data();
   ASSERT_EQ(sketch_data.size(), kCols * 2);
diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py
diff --git a/tests/test_distributed/test_with_spark/test_spark.py b/tests/test_distributed/test_with_spark/test_spark.py

Original file line number	Diff line number	Diff line change
`@@ -111,7 +111,11 @@ inline void TestRank(const std::vector<float>& column_cuts, const std::vector<fl`
`111`	`111`	`j++;`
`112`	`112`	`}`
`113`	`113`	`double expected_rank = ((i + 1) * total_weight) / column_cuts.size();`
`114`		`- double acceptable_error = std::max(2.9, total_weight * eps);`
	`114`	`+ // For small sketches, a purely relative tolerance can be tighter than one bin's`
	`115`	`+ // expected mass. Use the larger of the relative tolerance and the average per-cut`
	`116`	`+ // mass instead of a hard-coded floor.`
	`117`	`+ double acceptable_error =`
	`118`	`+ std::max(total_weight * eps, total_weight / static_cast<double>(column_cuts.size()));`
`115`	`119`	`EXPECT_LE(std::abs(expected_rank - sum_weight), acceptable_error);`
`116`	`120`	`}`
`117`	`121`	`}`