|
| 1 | +#!/usr/bin/env bash |
| 2 | +# Soak monitor for the from-genesis mainnet #763 validation run. |
| 3 | +# Emits ONE event line per actionable signal only: |
| 4 | +# - POTS DIVERGENCE ep{N} (reserves/treasury Δ != 0 vs Koios mainnet /totals) |
| 5 | +# - WEDGE #767 (block did not advance for 2 consecutive 60s checks; auto-samples) |
| 6 | +# - ERA {name} (era boundary crossed) |
| 7 | +# - HEARTBEAT (once/hour: epoch, block, era, pots status, blk/s) |
| 8 | +# - SOAK STOPPED (process gone -> exit) |
| 9 | +# Everything else (per-epoch byte-exact OK) is tracked silently and folded into HEARTBEAT. |
# Shell options: -u catches typos in variable names, pipefail surfaces failures
# inside pipelines. -e is deliberately NOT set: the metric pipeline returns
# non-zero whenever grep finds nothing, and the main loop handles that case
# itself by checking for empty output.
set -uo pipefail

# Run configuration. Each value defaults to the original soak run and can be
# overridden from the environment (SOAK_METRICS, SOAK_NODE_PID, SOAK_LOG,
# SOAK_REPORTS) so the monitor can be pointed at another node without editing
# this file.
readonly METRICS="${SOAK_METRICS:-http://127.0.0.1:12801/metrics}"  # metrics endpoint scraped by metric()
readonly NODE_PID="${SOAK_NODE_PID:-88299}"                         # pid probed with kill -0 and passed to sample
readonly LOG="${SOAK_LOG:-/private/tmp/dugite-mainnet-soak-20260615-154625.log}"  # node log path (not read by this script)
readonly REPORTS="${SOAK_REPORTS:-/Users/michaelfazio/Source/dugite/reports}"     # directory for wedge sample files
| 16 | + |
#######################################
# Map a mainnet epoch number to the name of the era it falls in.
# Arguments: $1 - epoch number (non-negative decimal integer)
# Outputs:   era name on stdout; "unknown" if $1 is missing or not a
#            non-negative integer
# Returns:   0 on success, 1 if the input was rejected
#######################################
era_for() {
  local e=${1-}
  # Reject empty/non-integer input up front: otherwise every numeric test
  # below errors out and the chain falls through to "Conway".
  if ! [[ $e =~ ^[0-9]+$ ]]; then
    echo unknown
    return 1
  fi
  # Each threshold is the first epoch of the NEXT era.
  if [ "$e" -lt 208 ]; then echo Byron
  elif [ "$e" -lt 236 ]; then echo Shelley
  elif [ "$e" -lt 251 ]; then echo Allegra
  elif [ "$e" -lt 290 ]; then echo Mary
  elif [ "$e" -lt 365 ]; then echo Alonzo
  elif [ "$e" -lt 507 ]; then echo Babbage
  else echo Conway; fi
}
| 27 | + |
#######################################
# Print the value of one metric scraped from the node's metrics endpoint.
# Globals:   METRICS (read) - URL of the metrics endpoint
# Arguments: $1 - metric name; interpolated into an extended regex that is
#                 anchored at line start and followed by a single space
# Outputs:   second whitespace-separated field of every matching line;
#            nothing when the fetch fails or no line matches
#######################################
metric() {
  curl -s --max-time 8 "$METRICS" 2>/dev/null \
    | grep -E "^$1 " \
    | awk '{ print $2 }'
}
| 29 | + |
#######################################
# Fetch the mainnet reserves and treasury for an epoch from the Koios
# /totals endpoint (used as ground truth by the pots check).
# Arguments: $1 - epoch number (non-negative decimal integer)
# Outputs:   one line "RESERVES TREASURY" on stdout. Each field is either a
#            decimal integer or the sentinel "?" when the value is missing,
#            null, non-numeric, or the request/parse failed.
# Returns:   0 always; callers inspect the fields.
#######################################
koios_pots() {
  local epoch=${1-} raw res tre
  # Never splice a non-numeric value into the request URL.
  if ! [[ $epoch =~ ^[0-9]+$ ]]; then
    echo "? ?"
    return 0
  fi
  # stderr of both stages is discarded on purpose: any failure simply yields
  # empty output, which is normalised to "?" below.
  raw=$(curl -s --max-time 20 "https://api.koios.rest/api/v1/totals?_epoch_no=${epoch}" -H "accept: application/json" 2>/dev/null \
    | python3 -c "import sys,json;d=json.load(sys.stdin);r=d[0] if d else {};print(r.get('reserves','?'),r.get('treasury','?'))" 2>/dev/null)
  read -r res tre _ <<<"$raw"
  # A JSON null prints as "None"; treat anything that is not a plain integer
  # as unknown so callers never compare or subtract garbage.
  [[ $res =~ ^[0-9]+$ ]] || res="?"
  [[ $tre =~ ^[0-9]+$ ]] || tre="?"
  echo "$res $tre"
}
| 34 | + |
# ---- monitor state (persists across ticks) ----
last_epoch=-1          # epoch seen on the previous good tick (-1 = none yet)
last_block=-1          # block number seen on the previous good tick
last_era=""            # era name of the previous good tick ("" = none yet)
stall=0                # consecutive ticks on which the block number did not move
pots_status="unknown"  # result of the most recent pots comparison
pots_epoch="?"         # epoch that pots_status refers to
hb_count=0             # good ticks since the last heartbeat (fires at 60)
block_hr_start=-1      # block number at the start of the heartbeat window
hb_started=$SECONDS    # $SECONDS value at the start of the heartbeat window
int_re='^[0-9]+$'      # a metric value is only used if it is a plain integer

# Main loop: one tick roughly every 60s. Each tick reads epoch and block from
# the metrics endpoint, then runs wedge detection, era-boundary detection, the
# per-epoch pots comparison against Koios, and the heartbeat counter. The loop
# only ends via the SOAK STOPPED exit.
while true; do
  ep=$(metric dugite_epoch_number)
  blk=$(metric dugite_block_number)

  # ---- liveness / stop detection ----
  if [[ -z $ep || -z $blk ]]; then
    if ! kill -0 "$NODE_PID" 2>/dev/null; then
      echo "SOAK STOPPED: node pid $NODE_PID gone; last seen ep${last_epoch} block${last_block}"
      exit 0
    fi
    # metrics briefly unreachable but process alive -> skip this tick quietly
    sleep 60
    continue
  fi

  # Values that are present but not integers would break every numeric test
  # below; report on stderr (stdout is reserved for event lines) and skip.
  if ! [[ $ep =~ $int_re && $blk =~ $int_re ]]; then
    printf 'WARN: non-integer metrics (epoch=%q block=%q); skipping tick\n' "$ep" "$blk" >&2
    sleep 60
    continue
  fi
  # Force base 10 so a leading zero is never parsed as octal.
  ep=$((10#$ep))
  blk=$((10#$blk))

  if (( block_hr_start < 0 )); then
    block_hr_start=$blk
    hb_started=$SECONDS
  fi

  # ---- wedge detection ----
  if (( blk == last_block )); then
    stall=$((stall + 1))
    if (( stall >= 2 )); then
      ts=$(date -u +%Y%m%dT%H%M%SZ)
      out="$REPORTS/767-wedge-sample-$ts.txt"
      # Make sure the target directory exists before sampling into it.
      mkdir -p "$REPORTS" 2>/dev/null
      # Sample in the background so a slow sampler never delays the next tick.
      sample "$NODE_PID" 5 -mayDie -f "$out" >/dev/null 2>&1 &
      echo "WEDGE #767 at ep${ep} block${blk} slot$(metric dugite_slot_number): block did not advance across 2x60s; sampling -> $out"
      stall=0 # don't re-fire every tick; wait for next genuine 2-stall
    fi
  else
    stall=0
  fi

  # ---- era boundary ----
  era=$(era_for "$ep")
  if [[ -n $last_era && $era != "$last_era" ]]; then
    echo "ERA $era reached at ep${ep} block${blk} (was $last_era)"
  fi
  last_era=$era

  # ---- epoch advance -> pots byte-exactness check ----
  # Skipped on the very first tick (last_epoch still -1): there is no
  # boundary to verify yet.
  if (( last_epoch >= 0 && ep > last_epoch )); then
    sleep 8 # let boundary pot update settle
    d_res=$(metric dugite_reserves_lovelace)
    d_tre=$(metric dugite_treasury_lovelace)
    # Re-read the epoch AFTER the pots: if the node moved on during the
    # settle window the values may belong to a later epoch, so skip.
    ep_after=$(metric dugite_epoch_number)
    if [[ $ep_after =~ $int_re && $d_res =~ $int_re && $d_tre =~ $int_re ]] \
      && (( 10#$ep_after == ep )); then
      read -r k_res k_tre _ <<<"$(koios_pots "$ep")"
      # Compare only when BOTH Koios values are real integers; a "?" or
      # empty field means "no ground truth", not a divergence.
      if [[ $k_res =~ $int_re && $k_tre =~ $int_re ]]; then
        if [[ $d_res == "$k_res" && $d_tre == "$k_tre" ]]; then
          pots_status="byte-exact"; pots_epoch=$ep
        else
          pots_status="DIVERGED"; pots_epoch=$ep
          dr=$(( 10#$d_res - 10#$k_res ))
          dt=$(( 10#$d_tre - 10#$k_tre ))
          echo "POTS DIVERGENCE ep${ep}: reserves dugite=$d_res koios=$k_res Δ=$dr | treasury dugite=$d_tre koios=$k_tre Δ=$dt"
        fi
      fi
    fi
    # Conway close-out signal for #763. Requires the status to have been
    # established for THIS epoch, not carried over from an earlier one.
    if (( ep >= 524 )) && [[ $pots_status == "byte-exact" && $pots_epoch == "$ep" ]]; then
      echo "POTS #763 byte-exact at ep${ep} (>=524, Conway): reserves=$d_res treasury=$d_tre match Koios"
    fi
  fi
  last_epoch=$ep
  last_block=$blk

  # ---- hourly heartbeat ----
  hb_count=$((hb_count + 1))
  if (( hb_count >= 60 )); then
    # Divide by measured wall-clock time: 60 ticks take longer than 3600s
    # once curl time and the settle sleeps are included.
    elapsed=$((SECONDS - hb_started))
    (( elapsed > 0 )) || elapsed=3600
    bps=$(( (blk - block_hr_start) / elapsed ))
    echo "HEARTBEAT ep${ep} $era block${blk} (~${bps} blk/s last hr) pots=${pots_status}@ep${pots_epoch}"
    hb_count=0
    block_hr_start=$blk
    hb_started=$SECONDS
  fi

  sleep 60
done
0 commit comments