Skip to content

Commit d276cd8

Browse files
committed
client: capture demand RU on onResponseWait throttle-fail
The new `demand_ru_per_sec` gauge promised "every entry point, never subtracted on throttle failure", but `onResponseWaitImpl` increments `mu.demandRUTotal` only after `acquireTokens` succeeds, so a throttle rejection silently drops the demand sample -- exactly the case the metric is meant to surface. Root cause: the increment was co-located with the consumption update inside the post-acquire lock block, even though demand and consumption have different lifetimes (demand is monotonic; consumption is rolled back on rejection). Four call sites carried the same inline expression, making the wrong placement easy to add and hard to notice. This commit makes the invariant structural: * Add `(*groupCostController).recordDemand`, the single point where `mu.demandRUTotal` grows. Its doc comment states the rule: callers MUST invoke it before any limiter wait/acquire so demand survives a rejection. * Route `onRequestWaitImpl`, `onResponseImpl`, `onResponseWaitImpl`, and `addRUConsumption` through `recordDemand`. In `onResponseWaitImpl` this also hoists the call above `acquireTokens`, fixing the bug. * Add `TestDemandRUCapturedOnResponseWaitThrottle` to lock in the invariant via the throttle-fail path. * Rewrite the EMA portion of `TestDemandRUTracking`: the previous version assigned `gc.run.now` only to have `updateRunState` immediately overwrite it with `time.Now()`, so the two-tick EMA assertion was a no-op. The new version drives `calcDemandAvg` directly with hand-set timestamps and asserts the actual EMA trajectory. * Mirror the `acceleratedReportingPeriod` failpoint into `calcDemandAvg` so any test that accelerates `calcAvg` accelerates the demand EMA in lockstep. * `calcDemandAvg` now returns whether it actually updated; the gauge `Set` is gated on that so we never re-publish a stale value when no time has elapsed. Drop the `< 0` clamp -- the input counter is monotonically increasing, so the EMA cannot go negative. 
* Extend the leak-TODO in `cleanUpResourceGroup` to include `LowTokenRequestNotifyCounter`, which has the same per-group label cardinality as the others on the list. Signed-off-by: JmPotato <github@ipotato.me>
1 parent 4d6937d commit d276cd8

File tree

3 files changed

+131
-26
lines changed

3 files changed

+131
-26
lines changed

client/resource_group/controller/global_controller.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -637,8 +637,9 @@ func (c *ResourceGroupsController) cleanUpResourceGroup() {
637637
metrics.DemandRUPerSecGauge.DeleteLabelValues(resourceGroupName)
638638
// TODO: clean up the remaining per-group metrics (e.g. TokenConsumedHistogram,
639639
// GroupRunningKVRequestCounter, SuccessfulRequestDuration, FailedRequestCounter,
640-
// ResourceGroupTokenRequestCounter, RequestRetryCounter, FailedLimitReserveDuration)
641-
// which currently leak label series on resource group deletion.
640+
// ResourceGroupTokenRequestCounter, RequestRetryCounter, FailedLimitReserveDuration,
641+
// LowTokenRequestNotifyCounter) which currently leak label series on resource
642+
// group deletion.
642643
return true
643644
}
644645
gc.inactive = true

client/resource_group/controller/group_controller.go

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,23 @@ func (gc *groupCostController) updateRunState() {
278278
gc.run.now = newTime
279279
}
280280

281+
// recordDemand accumulates a delta into the pre-throttling demand counter.
282+
//
283+
// Call sites MUST invoke this before any token-bucket wait/acquire so that
284+
// demand is captured even when the request is ultimately rejected by the
285+
// limiter; that is the entire reason `demandRUTotal` is tracked separately
286+
// from `consumption`. Demand is monotonically increasing and is never rolled
287+
// back on throttle failure.
288+
func (gc *groupCostController) recordDemand(delta *rmpb.Consumption) {
289+
v := getRUValueFromConsumption(delta)
290+
if v == 0 {
291+
return
292+
}
293+
gc.mu.Lock()
294+
gc.mu.demandRUTotal += v
295+
gc.mu.Unlock()
296+
}
297+
281298
func (gc *groupCostController) updateAvgRequestResourcePerSec() {
282299
isBurstable := true
283300
counter := gc.run.requestUnitTokens
@@ -287,8 +304,9 @@ func (gc *groupCostController) updateAvgRequestResourcePerSec() {
287304
if !gc.calcAvg(counter, getRUValueFromConsumption(gc.run.consumption)) {
288305
return
289306
}
290-
gc.calcDemandAvg(counter, gc.run.demandRUTotal)
291-
gc.metrics.demandRUPerSecGauge.Set(counter.avgDemandRUPerSec)
307+
if gc.calcDemandAvg(counter, gc.run.demandRUTotal) {
308+
gc.metrics.demandRUPerSecGauge.Set(counter.avgDemandRUPerSec)
309+
}
292310
logControllerTrace("[resource group controller] update avg ru per sec", zap.String("name", gc.name), zap.Float64("avg-ru-per-sec", counter.avgRUPerSec), zap.Float64("avg-demand-ru-per-sec", counter.avgDemandRUPerSec), zap.Bool("is-throttled", gc.isThrottled.Load()))
293311
gc.burstable.Store(isBurstable)
294312
}
@@ -337,18 +355,26 @@ func (gc *groupCostController) calcAvg(counter *tokenCounter, new float64) bool
337355
return true
338356
}
339357

340-
func (gc *groupCostController) calcDemandAvg(counter *tokenCounter, new float64) {
358+
// calcDemandAvg recomputes the EMA of pre-throttling demanded RU/s.
359+
//
360+
// Returns false (and leaves state untouched) when no time has elapsed since
361+
// the last update so the gauge is not re-Set with a stale reading. Unlike
362+
// `calcAvg`, no negative-clamp is needed because `demandRUTotal` is
363+
// monotonically increasing; the EMA of a non-negative-delta sequence cannot
364+
// itself become negative.
365+
func (gc *groupCostController) calcDemandAvg(counter *tokenCounter, new float64) bool {
341366
deltaDuration := gc.run.now.Sub(counter.avgDemandLastTime)
367+
failpoint.Inject("acceleratedReportingPeriod", func() {
368+
deltaDuration = 100 * time.Millisecond
369+
})
342370
if deltaDuration <= 0 {
343-
return
371+
return false
344372
}
345373
delta := (new - counter.avgDemandRUPerSecLastRU) / deltaDuration.Seconds()
346374
counter.avgDemandRUPerSec = movingAvgFactor*counter.avgDemandRUPerSec + (1-movingAvgFactor)*delta
347-
if counter.avgDemandRUPerSec < 0 {
348-
counter.avgDemandRUPerSec = 0
349-
}
350375
counter.avgDemandLastTime = gc.run.now
351376
counter.avgDemandRUPerSecLastRU = new
377+
return true
352378
}
353379

354380
func (gc *groupCostController) shouldReportConsumption() bool {
@@ -584,9 +610,12 @@ func (gc *groupCostController) onRequestWaitImpl(
584610
calc.BeforeKVRequest(delta, info)
585611
}
586612

613+
// Record pre-throttling demand before any limiter interaction so a
614+
// subsequent rollback only unwinds consumption, not demand.
615+
gc.recordDemand(delta)
616+
587617
gc.mu.Lock()
588618
add(gc.mu.consumption, delta)
589-
gc.mu.demandRUTotal += getRUValueFromConsumption(delta)
590619
gc.mu.Unlock()
591620

592621
if !gc.burstable.Load() {
@@ -634,6 +663,13 @@ func (gc *groupCostController) onResponseImpl(
634663
for _, calc := range gc.calculators {
635664
calc.AfterKVRequest(delta, req, resp)
636665
}
666+
667+
// Record pre-throttling demand. `onResponseImpl` does not block on token
668+
// acquisition, so this could equivalently sit inside the lock block below;
669+
// keeping it here makes the demand-before-limiter invariant uniform across
670+
// all entry points.
671+
gc.recordDemand(delta)
672+
637673
if !gc.burstable.Load() {
638674
counter := gc.run.requestUnitTokens
639675
if v := getRUValueFromConsumption(delta); v > 0 {
@@ -644,8 +680,6 @@ func (gc *groupCostController) onResponseImpl(
644680
gc.mu.Lock()
645681
// Record the consumption of the request
646682
add(gc.mu.consumption, delta)
647-
// Record the response-phase demand as well (actual read bytes, CPU, etc.)
648-
gc.mu.demandRUTotal += getRUValueFromConsumption(delta)
649683
// Record the consumption of the request by store
650684
count := &rmpb.Consumption{}
651685
*count = *delta
@@ -667,6 +701,13 @@ func (gc *groupCostController) onResponseWaitImpl(
667701
for _, calc := range gc.calculators {
668702
calc.AfterKVRequest(delta, req, resp)
669703
}
704+
705+
// Record pre-throttling demand BEFORE acquireTokens so it is captured
706+
// even when the response is rejected by the limiter. Without this hoist
707+
// the demand counter would silently miss exactly the throttled responses
708+
// the metric is supposed to surface.
709+
gc.recordDemand(delta)
710+
670711
var waitDuration time.Duration
671712
if !gc.burstable.Load() {
672713
allowDebt := delta.ReadBytes+delta.WriteBytes < bigRequestThreshold || !gc.isThrottled.Load()
@@ -687,7 +728,6 @@ func (gc *groupCostController) onResponseWaitImpl(
687728
gc.mu.Lock()
688729
// Record the consumption of the request
689730
add(gc.mu.consumption, delta)
690-
gc.mu.demandRUTotal += getRUValueFromConsumption(delta)
691731
// Record the consumption of the request by store
692732
count := &rmpb.Consumption{}
693733
*count = *delta
@@ -703,9 +743,9 @@ func (gc *groupCostController) onResponseWaitImpl(
703743
}
704744

705745
func (gc *groupCostController) addRUConsumption(consumption *rmpb.Consumption) {
746+
gc.recordDemand(consumption)
706747
gc.mu.Lock()
707748
add(gc.mu.consumption, consumption)
708-
gc.mu.demandRUTotal += getRUValueFromConsumption(consumption)
709749
gc.mu.Unlock()
710750
}
711751

client/resource_group/controller/group_controller_test.go

Lines changed: 76 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ func TestDemandRUTracking(t *testing.T) {
345345
gc.mu.Unlock()
346346
re.Positive(demandTotal, "demand should be accumulated after requests")
347347

348-
// Now issue a request that gets throttled (rejected).
348+
// Now issue a request that gets throttled (rejected) on `onRequestWaitImpl`.
349349
bigReq := &TestRequestInfo{
350350
isWrite: true,
351351
writeBytes: 10000000,
@@ -361,18 +361,82 @@ func TestDemandRUTracking(t *testing.T) {
361361
re.Greater(demandAfterThrottle, demandTotal,
362362
"demand should increase even for throttled requests")
363363

364-
// Verify that the demand EMA is computed correctly.
365-
now := time.Now()
366-
gc.run.now = now
367-
gc.updateRunState()
368-
gc.updateAvgRequestResourcePerSec()
364+
// Verify the demand EMA math directly. We deliberately avoid going through
365+
// `updateRunState` here because that method overwrites `gc.run.now` with
366+
// `time.Now()` on every call, which makes any caller-side time control a
367+
// no-op. Instead, snapshot demand into `gc.run` once and drive `calcDemandAvg`
368+
// with hand-set timestamps so the EMA's behavior is observable.
369+
gc.updateRunState() // copy mu.demandRUTotal into gc.run.demandRUTotal once.
369370

370-
// Advance time and update again so the EMA has two data points.
371-
gc.run.now = now.Add(time.Second)
372-
gc.updateRunState()
373-
gc.updateAvgRequestResourcePerSec()
371+
counter := gc.run.requestUnitTokens
372+
// Reset the EMA bookkeeping so we can measure a clean two-tick trajectory.
373+
counter.avgDemandRUPerSec = 0
374+
counter.avgDemandRUPerSecLastRU = 0
375+
base := time.Unix(0, 0)
376+
counter.avgDemandLastTime = base
377+
gc.run.now = base.Add(time.Second)
378+
379+
re.True(gc.calcDemandAvg(counter, gc.run.demandRUTotal))
380+
// First tick: avg = movingAvgFactor*0 + (1-movingAvgFactor) * (demandTotal/1s).
381+
expectedFirst := (1 - movingAvgFactor) * gc.run.demandRUTotal
382+
re.InEpsilon(expectedFirst, counter.avgDemandRUPerSec, 1e-9,
383+
"first EMA tick should equal (1-movingAvgFactor) * demand-rate")
384+
385+
// Second tick: same demand snapshot, one more second elapsed -> rate is 0,
386+
// so the EMA must decay toward zero by movingAvgFactor.
387+
gc.run.now = base.Add(2 * time.Second)
388+
prev := counter.avgDemandRUPerSec
389+
re.True(gc.calcDemandAvg(counter, gc.run.demandRUTotal))
390+
re.InEpsilon(movingAvgFactor*prev, counter.avgDemandRUPerSec, 1e-9,
391+
"with no new demand the EMA should decay by movingAvgFactor")
392+
393+
// Same `gc.run.now` -> calcDemandAvg must report no update and leave state alone.
394+
re.False(gc.calcDemandAvg(counter, gc.run.demandRUTotal))
395+
}
374396

397+
// TestDemandRUCapturedOnResponseWaitThrottle locks in the invariant that
398+
// `demand_ru_per_sec` reflects rejected responses too. Without the
399+
// `recordDemand` hoist in `onResponseWaitImpl`, throttle-rejected responses
400+
// would be silently absent from the demand counter -- defeating the metric.
401+
func TestDemandRUCapturedOnResponseWaitThrottle(t *testing.T) {
402+
re := require.New(t)
403+
gc := createTestGroupCostController(re)
404+
// Short retry budget so the test fails fast.
405+
gc.mainCfg.WaitRetryInterval = 5 * time.Millisecond
406+
gc.mainCfg.WaitRetryTimes = 2
407+
gc.mainCfg.LTBMaxWaitDuration = 10 * time.Millisecond
408+
409+
// Stop the bucket from refilling. The limiter still carries its initial
410+
// tokens (FillRate=1000 -> 1000 RU seeded in initRunState), so the request
411+
// below must demand strictly more than that to provoke a throttle error.
375412
counter := gc.run.requestUnitTokens
376-
re.GreaterOrEqual(counter.avgDemandRUPerSec, 0.0,
377-
"demand EMA should be non-negative")
413+
counter.limiter.Reconfigure(time.Now(), tokenBucketReconfigureArgs{
414+
newTokens: 0,
415+
newFillRate: 0,
416+
newBurst: 0,
417+
})
418+
// `allowDebt` in `onResponseWaitImpl` is false only when the response is
419+
// "big" (read+write bytes >= bigRequestThreshold) AND the group is already
420+
// throttled. Force both.
421+
gc.isThrottled.Store(true)
422+
423+
gc.mu.Lock()
424+
demandBefore := gc.mu.demandRUTotal
425+
gc.mu.Unlock()
426+
427+
const readBytes = 128 * 1024 * 1024 // 128 MiB -> 2048 RRU at default 1/64KiB cost
428+
req := &TestRequestInfo{isWrite: false}
429+
resp := &TestResponseInfo{
430+
readBytes: readBytes,
431+
succeed: true,
432+
}
433+
_, _, err := gc.onResponseWaitImpl(context.TODO(), req, resp)
434+
re.Error(err)
435+
re.True(errs.ErrClientResourceGroupThrottled.Equal(err))
436+
437+
gc.mu.Lock()
438+
demandAfter := gc.mu.demandRUTotal
439+
gc.mu.Unlock()
440+
re.Greater(demandAfter, demandBefore,
441+
"demand should be recorded for responses rejected by the limiter")
378442
}

0 commit comments

Comments
 (0)