Clarify that all workspace resources are actually counted independently despite the hierarchy

achirkin · achirkin · commit 69543a18f330 · 2026-04-02T02:23:07.000-07:00
diff --git a/cpp/include/raft/util/dry_run_memory_resource.hpp b/cpp/include/raft/util/dry_run_memory_resource.hpp
@@ -164,7 +164,20 @@ class dry_run_resources : public resources {
 
   void init()
   {
-    // Force-initialize all affected resources (lazy creation).
+    // Independent-counting invariant
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // 1. Force-initialize all lazily-created resources (workspace, large workspace,
+    //    pinned, managed) so that their factories resolve against the *original*
+    //    global device MR, not a tracking wrapper we install later.
+    // 2. Capture every upstream ref while it still points to the original resource.
+    // 3. Snapshot the resource map to keep the originals alive.
+    // 4. Only *then* replace the global device resource with the tracking bridge.
+    // 5. Wrap each captured upstream with a separate dry_run_resource adaptor.
+    //
+    // Because step 2 happens before step 4, workspace/large-workspace allocations flow through
+    // their own adaptor directly to old_device_mr_, bypassing the device bridge.
+    // Each allocation is therefore counted in exactly one category, and
+    // memory_stats::total() returns an accurate, non-overlapping sum.
     auto* ws         = resource::get_workspace_resource(*this);
     auto ws_free     = resource::get_workspace_free_bytes(*this);
     auto ws_upstream = ws->get_upstream_resource();
diff --git a/cpp/include/raft/util/memory_stats_resources.hpp b/cpp/include/raft/util/memory_stats_resources.hpp
@@ -43,11 +43,15 @@ struct memory_stats {
   std::size_t host_pinned{0};
 
   /**
-   * @brief Plain sum of all memory stats.
+   * @brief Sum of all memory stats across the six tracked categories.
    *
-   * Note, this does not take into account the resource hierarchy.
-   * For example, it's common that workspace resources are allocated from the device global
-   * resource, so they are effectively counted twice in this function.
+   * The three resource wrapper classes (dry_run_resources, memory_stats_resources,
+   * memory_tracking_resources) guarantee that every category is tracked by its own
+   * independent adaptor: each wrapper force-initializes all resources, captures their
+   * upstream refs *before* replacing the global device resource, and wraps those
+   * originals.  Workspace and large-workspace allocations therefore bypass the
+   * device-global tracking adaptor and are counted exactly once, making this sum
+   * an accurate total when used with stats produced by any of the three wrappers.
    */
   [[nodiscard]] inline constexpr auto total() const -> std::size_t
   {
@@ -193,6 +197,20 @@ class memory_stats_resources : public resources {
 
   void init()
   {
+    // Independent-counting invariant
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // 1. Force-initialize all lazily-created resources (workspace, large workspace,
+    //    pinned, managed) so that their factories resolve against the *original*
+    //    global device MR, not a tracking wrapper we install later.
+    // 2. Capture every upstream ref while it still points to the original resource.
+    // 3. Snapshot the resource map to keep the originals alive.
+    // 4. Only *then* replace the global device resource with the tracking bridge.
+    // 5. Wrap each captured upstream with a separate statistics_adaptor.
+    //
+    // Because step 2 happens before step 4, workspace/large-workspace allocations flow through
+    // their own adaptor directly to old_device_mr_, bypassing the device bridge.
+    // Each allocation is therefore counted in exactly one category, and
+    // memory_stats::total() returns an accurate, non-overlapping sum.
     auto* ws         = resource::get_workspace_resource(*this);
     auto ws_free     = resource::get_workspace_free_bytes(*this);
     auto ws_upstream = ws->get_upstream_resource();
diff --git a/cpp/include/raft/util/memory_tracking_resources.hpp b/cpp/include/raft/util/memory_tracking_resources.hpp
@@ -190,6 +190,20 @@ class memory_tracking_resources : public resources {
 
   void init()
   {
+    // Independent-counting invariant
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // 1. Force-initialize all lazily-created resources (workspace, large workspace,
+    //    pinned, managed) so that their factories resolve against the *original*
+    //    global device MR, not a tracking wrapper we install later.
+    // 2. Capture every upstream ref while it still points to the original resource.
+    // 3. Snapshot the resource map to keep the originals alive.
+    // 4. Only *then* replace the global device resource with the tracking bridge.
+    // 5. Wrap each captured upstream with a separate statistics/notifying adaptor.
+    //
+    // Because step 2 happens before step 4, workspace/large-workspace allocations flow through
+    // their own adaptor directly to old_device_mr_, bypassing the device bridge.
+    // Each allocation is therefore counted in exactly one category, and
+    // memory_stats::total() returns an accurate, non-overlapping sum.
     auto* ws          = raft::resource::get_workspace_resource(*this);
     auto ws_free      = raft::resource::get_workspace_free_bytes(*this);
     auto upstream_ref = ws->get_upstream_resource();
diff --git a/cpp/tests/test_utils.cuh b/cpp/tests/test_utils.cuh
@@ -362,9 +362,8 @@ void execute_with_dry_run_check(raft::resources const& res,
   resource::sync_stream(stat_res);
   auto actual = stat_res.get_bytes_peak();
 
-  auto total_dry = dry.device_global + dry.device_managed + dry.host + dry.host_pinned;
-  auto total_actual =
-    actual.device_global + actual.device_managed + actual.host + actual.host_pinned;
+  auto total_dry    = dry.total();
+  auto total_actual = actual.total();
 
   if (dry.device_workspace != actual.device_workspace ||
       dry.device_large_workspace != actual.device_large_workspace ||
diff --git a/cpp/tests/util/dry_run_memory_resource.cpp b/cpp/tests/util/dry_run_memory_resource.cpp
@@ -232,4 +232,159 @@ TEST(DryRunExecute, ExceptionSafety)
   EXPECT_FALSE(resource::get_dry_run_flag(res));
 }
 
+// ===== Independent-counting tests for dry_run_resources =====
+
+TEST(DryRunResources, IndependentCounting_DefaultWorkspace)
+{
+  raft::resources res;
+  // Checks that dry_run_resources reports workspace and global device bytes separately.
+  dry_run_resources dry_res(res);
+  // The two sizes differ, so the assertions below can tell the categories apart.
+  constexpr std::size_t kWsSize     = 1024;
+  constexpr std::size_t kGlobalSize = 2048;
+  // First allocation: through the workspace resource obtained from the wrapper.
+  auto* ws_mr  = resource::get_workspace_resource(dry_res);
+  void* ws_ptr = ws_mr->allocate(rmm::cuda_stream_view{}, kWsSize);
+  // Second allocation: through whatever is currently the global device resource.
+  auto* dev_mr  = rmm::mr::get_current_device_resource();
+  void* dev_ptr = dev_mr->allocate(rmm::cuda_stream_view{}, kGlobalSize);
+  // Each request must land in its own category; total() must be their plain sum.
+  auto peak = dry_res.get_bytes_peak();
+  EXPECT_EQ(peak.device_workspace, kWsSize);
+  EXPECT_EQ(peak.device_global, kGlobalSize);
+  EXPECT_EQ(peak.total(), kWsSize + kGlobalSize);
+  // Hand each pointer back to the resource it came from.
+  ws_mr->deallocate(rmm::cuda_stream_view{}, ws_ptr, kWsSize);
+  dev_mr->deallocate(rmm::cuda_stream_view{}, dev_ptr, kGlobalSize);
+}
+
+TEST(DryRunResources, IndependentCounting_WorkspaceSetToGlobal)
+{
+  raft::resources res;
+  resource::set_workspace_to_global_resource(res);
+  // Variant: the workspace was set to the global resource before wrapping `res`.
+  dry_run_resources dry_res(res);
+
+  constexpr std::size_t kWsSize     = 1024;
+  constexpr std::size_t kGlobalSize = 2048;
+  // Allocate kWsSize via the wrapper's workspace resource ...
+  auto* ws_mr  = resource::get_workspace_resource(dry_res);
+  void* ws_ptr = ws_mr->allocate(rmm::cuda_stream_view{}, kWsSize);
+  // ... and kGlobalSize via the current global device resource.
+  auto* dev_mr  = rmm::mr::get_current_device_resource();
+  void* dev_ptr = dev_mr->allocate(rmm::cuda_stream_view{}, kGlobalSize);
+  // Expect no overlap: workspace bytes must not also show up under device_global.
+  auto peak = dry_res.get_bytes_peak();
+  EXPECT_EQ(peak.device_workspace, kWsSize);
+  EXPECT_EQ(peak.device_global, kGlobalSize);
+  EXPECT_EQ(peak.total(), kWsSize + kGlobalSize);
+  // Return both allocations to their originating resources.
+  ws_mr->deallocate(rmm::cuda_stream_view{}, ws_ptr, kWsSize);
+  dev_mr->deallocate(rmm::cuda_stream_view{}, dev_ptr, kGlobalSize);
+}
+
+// ===== Independent-counting tests for memory_stats_resources =====
+
+TEST(MemoryStatsResources, IndependentCounting_DefaultWorkspace)
+{
+  raft::resources res;
+  // Checks that memory_stats_resources keeps workspace and global device bytes apart.
+  memory_stats_resources stat_res(res);
+  // Unequal sizes make a swapped or doubled count show up in the expectations.
+  constexpr std::size_t kWsSize     = 1024;
+  constexpr std::size_t kGlobalSize = 2048;
+  // Workspace-side allocation, taken from the resource exposed by the wrapper.
+  auto* ws_mr  = resource::get_workspace_resource(stat_res);
+  void* ws_ptr = ws_mr->allocate(rmm::cuda_stream_view{}, kWsSize);
+  // Global-side allocation, taken from the current device resource.
+  auto* dev_mr  = rmm::mr::get_current_device_resource();
+  void* dev_ptr = dev_mr->allocate(rmm::cuda_stream_view{}, kGlobalSize);
+  // Read peak stats while both allocations are still live, then compare per category.
+  auto peak = stat_res.get_bytes_peak();
+  EXPECT_EQ(peak.device_workspace, kWsSize);
+  EXPECT_EQ(peak.device_global, kGlobalSize);
+  EXPECT_EQ(peak.total(), kWsSize + kGlobalSize);
+  // Free through the same resource that performed each allocation.
+  ws_mr->deallocate(rmm::cuda_stream_view{}, ws_ptr, kWsSize);
+  dev_mr->deallocate(rmm::cuda_stream_view{}, dev_ptr, kGlobalSize);
+}
+
+TEST(MemoryStatsResources, IndependentCounting_WorkspaceSetToGlobal)
+{
+  raft::resources res;
+  resource::set_workspace_to_global_resource(res);
+  // Variant: `res` is wrapped after its workspace was set to the global resource.
+  memory_stats_resources stat_res(res);
+
+  constexpr std::size_t kWsSize     = 1024;
+  constexpr std::size_t kGlobalSize = 2048;
+  // One allocation through the wrapper's workspace resource.
+  auto* ws_mr  = resource::get_workspace_resource(stat_res);
+  void* ws_ptr = ws_mr->allocate(rmm::cuda_stream_view{}, kWsSize);
+  // One allocation through the current global device resource.
+  auto* dev_mr  = rmm::mr::get_current_device_resource();
+  void* dev_ptr = dev_mr->allocate(rmm::cuda_stream_view{}, kGlobalSize);
+  // The workspace bytes must be reported once, not additionally under device_global.
+  auto peak = stat_res.get_bytes_peak();
+  EXPECT_EQ(peak.device_workspace, kWsSize);
+  EXPECT_EQ(peak.device_global, kGlobalSize);
+  EXPECT_EQ(peak.total(), kWsSize + kGlobalSize);
+  // Clean up both allocations via their own resources.
+  ws_mr->deallocate(rmm::cuda_stream_view{}, ws_ptr, kWsSize);
+  dev_mr->deallocate(rmm::cuda_stream_view{}, dev_ptr, kGlobalSize);
+}
+
+TEST(MemoryStatsResources, IndependentCounting_PoolWorkspace)
+{
+  raft::resources res;
+  constexpr std::size_t kPoolLimit = 64UL * 1024UL * 1024UL;
+  resource::set_workspace_to_pool_resource(res, kPoolLimit);
+  // Variant: pool workspace configured with kPoolLimit (64 MiB) before wrapping.
+  memory_stats_resources stat_res(res);
+
+  constexpr std::size_t kWsSize     = 1024;
+  constexpr std::size_t kGlobalSize = 2048;
+  // Allocation made through the workspace resource exposed by the wrapper.
+  auto* ws_mr  = resource::get_workspace_resource(stat_res);
+  void* ws_ptr = ws_mr->allocate(rmm::cuda_stream_view{}, kWsSize);
+  // Allocation made through the current global device resource.
+  auto* dev_mr  = rmm::mr::get_current_device_resource();
+  void* dev_ptr = dev_mr->allocate(rmm::cuda_stream_view{}, kGlobalSize);
+  // device_workspace is expected to equal the requested kWsSize, not the pool size.
+  auto peak = stat_res.get_bytes_peak();
+  EXPECT_EQ(peak.device_workspace, kWsSize);
+  EXPECT_EQ(peak.device_global, kGlobalSize);
+  EXPECT_EQ(peak.total(), kWsSize + kGlobalSize);
+  // Release each allocation through the resource that served it.
+  ws_mr->deallocate(rmm::cuda_stream_view{}, ws_ptr, kWsSize);
+  dev_mr->deallocate(rmm::cuda_stream_view{}, dev_ptr, kGlobalSize);
+}
+
+// ===== Nested wrappers test =====
+
+TEST(IndependentCounting, NestedDryRunInStats)
+{
+  raft::resources res;
+  // Stack the wrappers: stat_res wraps `res`, then dry_res wraps stat_res.
+  memory_stats_resources stat_res(res);
+  dry_run_resources dry_res(stat_res);
+
+  constexpr std::size_t kWsSize     = 1024;
+  constexpr std::size_t kGlobalSize = 2048;
+  // Allocate through the workspace resource of the outer (dry-run) wrapper.
+  auto* ws_mr  = resource::get_workspace_resource(dry_res);
+  void* ws_ptr = ws_mr->allocate(rmm::cuda_stream_view{}, kWsSize);
+  // Allocate through the global device resource that is current at this point.
+  auto* dev_mr  = rmm::mr::get_current_device_resource();
+  void* dev_ptr = dev_mr->allocate(rmm::cuda_stream_view{}, kGlobalSize);
+  // Only the outer wrapper's stats are checked; stat_res itself is not inspected.
+  auto peak = dry_res.get_bytes_peak();
+  EXPECT_EQ(peak.device_workspace, kWsSize);
+  EXPECT_EQ(peak.device_global, kGlobalSize);
+  EXPECT_EQ(peak.total(), kWsSize + kGlobalSize);
+  // Free both allocations through the resources that produced them.
+  ws_mr->deallocate(rmm::cuda_stream_view{}, ws_ptr, kWsSize);
+  dev_mr->deallocate(rmm::cuda_stream_view{}, dev_ptr, kGlobalSize);
+}
+
 }  // namespace raft::util