Skip to content

Commit 9b3a109

Browse files
committed
fix(metrics): keep pots gauges live at tip (treasury/reserves froze one epoch stale)
dugite_treasury_lovelace / dugite_reserves_lovelace are refreshed from ls.epochs only on the bulk-sync catch-up branch of apply_fetched_block and on the forge path. At tip, received blocks take the `at_tip` branch, which calls publish_ledger_view (which does NOT touch the pots atomics). As a result, once the node reaches tip the gauges freeze at the last catch-up value and never reflect the reserves->treasury reward-update transfer applied at an epoch boundary the node did not forge itself.

Observed on preview epoch 1332->1333: the gauges read the exact epoch-1332 pots while the node was correctly in epoch 1333 — nearly triggering a false reward-divergence investigation. The ledger itself is byte-exact: the epoch-1333 snapshot has treasury=6820324388335672 reserves=7916699786696095, matching Koios epoch 1333 to the lovelace. This is purely a metric-staleness bug, but a dangerous one because it misleads epoch-boundary cross-checks.

Fix: add NodeMetrics::set_pots() (two O(1) atomic stores) and call it from post_block_apply_updates — the per-block refresh that already reads live ls.epoch under a fresh read-lock. This runs on every apply path (tip, catch-up, forge, replay), so the pots gauge is now as live as the epoch gauge.

TDD: metrics::tests::test_set_pots_updates_gauges (red->green).

Also adds a DUGITE_DUMP_LEDGER_PROBE diagnostic to apply_bench that prints pots + reward-update inputs (bprev_blocks_by_pool, go snapshot, ss_fee, prev_d) straight from a snapshot — the instrument that distinguished this metric-lag from a real ledger divergence.
1 parent a774ac8 commit 9b3a109

3 files changed

Lines changed: 103 additions & 1 deletion

File tree

crates/dugite-node/src/bin/apply_bench.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,51 @@ fn main() {
381381
ledger.utxo.utxo_set.len()
382382
);
383383

384+
// Diagnostic probe (issue: epoch-boundary reward-pots divergence).
385+
// When DUGITE_DUMP_LEDGER_PROBE is set, print the reward-update inputs and
386+
// pots straight from the loaded snapshot, then exit — used to confirm
387+
// whether `bprev_blocks_by_pool` (eta numerator) is empty at the boundary.
388+
if std::env::var("DUGITE_DUMP_LEDGER_PROBE").is_ok() {
389+
let e = &ledger.epochs;
390+
let bprev_sum: u64 = e.snapshots.bprev_blocks_by_pool.values().sum();
391+
eprintln!("[probe] epoch={}", ledger.epoch.0);
392+
eprintln!("[probe] treasury={}", e.treasury.0);
393+
eprintln!("[probe] reserves={}", e.reserves.0);
394+
eprintln!(
395+
"[probe] bprev_block_count={} bprev_blocks_by_pool.len={} bprev_sum={}",
396+
e.snapshots.bprev_block_count,
397+
e.snapshots.bprev_blocks_by_pool.len(),
398+
bprev_sum
399+
);
400+
eprintln!("[probe] ss_fee={}", e.snapshots.ss_fee.0);
401+
match e.snapshots.go.as_ref() {
402+
Some(g) => eprintln!(
403+
"[probe] go: epoch={} pools={} stake_creds={} go_block_count={} go_blocks_by_pool.len={}",
404+
g.epoch.0,
405+
g.pool_stake.len(),
406+
g.stake_distribution.len(),
407+
g.epoch_block_count,
408+
g.epoch_blocks_by_pool.len()
409+
),
410+
None => eprintln!("[probe] go: NONE"),
411+
}
412+
eprintln!(
413+
"[probe] set_present={} mark_present={}",
414+
e.snapshots.set.is_some(),
415+
e.snapshots.mark.is_some()
416+
);
417+
eprintln!(
418+
"[probe] prev_d={}/{} prev_pv_major={}",
419+
e.prev_d.numerator, e.prev_d.denominator, e.prev_protocol_version_major
420+
);
421+
eprintln!(
422+
"[probe] reward_accounts={} rupd_addrs_rew_present={}",
423+
ledger.certs.reward_accounts.len(),
424+
e.rupd_addrs_rew.is_some()
425+
);
426+
std::process::exit(0);
427+
}
428+
384429
// Note: We deliberately do NOT restore the LSM UTxO store from disk.
385430
// The in-memory UtxoStore populated from the snapshot is sufficient for
386431
// measuring the apply-path bottleneck (CBOR decode + validation + UTxO ops).

crates/dugite-node/src/metrics.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1452,6 +1452,22 @@ impl NodeMetrics {
14521452
.store(u64::from(peer_sharing), Ordering::Relaxed);
14531453
}
14541454

1455+
/// Refresh just the pots gauges (`dugite_treasury_lovelace` /
1456+
/// `dugite_reserves_lovelace`) from the live ledger state.
1457+
///
1458+
/// Two O(1) atomic stores — cheap enough to run on the per-block apply
1459+
/// path (`post_block_apply_updates`) so the gauges stay live at tip, not
1460+
/// only during bulk-sync catch-up. Without this, the at-tip apply path
1461+
/// (which calls `publish_ledger_view`, not `set_governance_snapshot`)
1462+
/// left the pots frozen at the last catch-up value, reading one epoch
1463+
/// stale across every boundary the node did not forge itself.
1464+
pub fn set_pots(&self, treasury_lovelace: u64, reserves_lovelace: u64) {
1465+
self.treasury_lovelace
1466+
.store(treasury_lovelace, Ordering::Relaxed);
1467+
self.reserves_lovelace
1468+
.store(reserves_lovelace, Ordering::Relaxed);
1469+
}
1470+
14551471
/// Update every governance-related gauge from a flattened snapshot.
14561472
///
14571473
/// Called from the node's startup init path (`run`) and the sync loop's
@@ -3375,6 +3391,39 @@ mod tests {
33753391
assert!(output.contains("dugite_drep_active 7500\n"));
33763392
}
33773393

3394+
// Regression: the per-block metric refresh (`post_block_apply_updates`)
3395+
// must keep the pots gauges live at tip, not only during bulk-sync
3396+
// catch-up. Before the fix, the at-tip apply path called
3397+
// `publish_ledger_view` (which does NOT touch the pots atomics) so
3398+
// `dugite_treasury_lovelace` / `dugite_reserves_lovelace` froze at the
3399+
// last catch-up value and never reflected an epoch-boundary reserves→
3400+
// treasury transfer — making the gauges read one epoch stale at tip and
3401+
// nearly triggering a false reward-divergence investigation on preview
3402+
// epoch 1332→1333. `set_pots` is the O(1) seam the per-block path uses.
3403+
#[test]
3404+
fn test_set_pots_updates_gauges() {
3405+
let metrics = NodeMetrics::new();
3406+
assert_eq!(metrics.treasury_lovelace.load(Ordering::Relaxed), 0);
3407+
assert_eq!(metrics.reserves_lovelace.load(Ordering::Relaxed), 0);
3408+
3409+
// Preview epoch-1333 pots (byte-exact vs Koios) — the values the
3410+
// gauge must show once the boundary block is applied at tip.
3411+
metrics.set_pots(6_820_324_388_335_672, 7_916_699_786_696_095);
3412+
3413+
assert_eq!(
3414+
metrics.treasury_lovelace.load(Ordering::Relaxed),
3415+
6_820_324_388_335_672
3416+
);
3417+
assert_eq!(
3418+
metrics.reserves_lovelace.load(Ordering::Relaxed),
3419+
7_916_699_786_696_095
3420+
);
3421+
3422+
let output = metrics.to_prometheus();
3423+
assert!(output.contains("dugite_treasury_lovelace 6820324388335672\n"));
3424+
assert!(output.contains("dugite_reserves_lovelace 7916699786696095\n"));
3425+
}
3426+
33783427
// D5: dugite_drep_count HELP text must document the active+inactive
33793428
// semantics and point to Koios for DReps-with-delegated-stake.
33803429
#[test]

crates/dugite-node/src/node/mod.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6998,16 +6998,24 @@ impl Node {
69986998
.slot_to_wallclock_ms(block_slot.0, &view.slot_config)
69996999
.await;
70007000
self.metrics.set_tip_slot_time_ms(slot_time_ms);
7001-
let (live_epoch, pv_major, pv_minor) = {
7001+
let (live_epoch, pv_major, pv_minor, treasury, reserves) = {
70027002
let ls = self.ledger_state.read().await;
70037003
(
70047004
ls.epoch.0,
70057005
ls.epochs.protocol_params.protocol_version_major,
70067006
ls.epochs.protocol_params.protocol_version_minor,
7007+
ls.epochs.treasury.0,
7008+
ls.epochs.reserves.0,
70077009
)
70087010
};
70097011
self.metrics.set_epoch(live_epoch);
70107012
self.metrics.set_protocol_version(pv_major, pv_minor);
7013+
// Keep the pots gauges as live as the epoch gauge. This per-block
7014+
// path runs at tip too (unlike the catch-up-only inline store in
7015+
// `apply_fetched_block`), so the reserves→treasury transfer applied
7016+
// at an epoch boundary is reflected immediately even when the node
7017+
// did not forge the boundary block. Cheap (two atomic stores).
7018+
self.metrics.set_pots(treasury, reserves);
70117019
}
70127020
// Authoritative era from the applied block's HFC era tag — NOT from the
70137021
// ledger protocol-version major (which is Shelley-shaped and reads 2

0 commit comments

Comments
 (0)