
Commit 882647e

Authored by Saba9, claude, gradio-pr-bot, and abidlabs
Add multi-GPU system metrics support (#481)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
Co-authored-by: Abubakar Abid <abubakar@huggingface.co>
Co-authored-by: Abubakar Abid <islamrealm@gmail.com>
1 parent 06ea885 commit 882647e

16 files changed

Lines changed: 3215 additions & 78 deletions

.changeset/forty-pigs-beg.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
---
"trackio": minor
---

feat:Add multi-GPU system metrics support
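
The feature added by this commit is driven entirely from `trackio.init`. Below is a minimal, hedged sketch of how a user script would turn it on; the `auto_log_gpu` and `gpu_log_interval` parameters and the `trackio.log`/`trackio.finish` calls are taken from the example script and tests in this commit, while the project and run names are placeholders:

import time

import trackio

# Sketch only: with several visible GPUs, per-GPU keys such as
# "gpu/0/utilization" and "gpu/1/utilization" should be recorded automatically.
trackio.init(
    project="my-project",     # placeholder project name
    name="multi-gpu-run",     # placeholder run name
    auto_log_gpu=True,        # sample system metrics for every visible GPU
    gpu_log_interval=1.0,     # seconds between GPU samples
)
trackio.log({"loss": 0.5}, step=0)
time.sleep(2.0)               # give the background sampler time to record a sample
trackio.finish()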

examples/hf-jobs-multi-gpu-system-metrics.py

Lines changed: 244 additions & 0 deletions

@@ -0,0 +1,244 @@
"""
hf-jobs-multi-gpu-system-metrics.py
===================================

Minimal example that exercises Trackio's multi-GPU system metrics path on a
single multi-GPU machine. The script uploads only this file with
`hf jobs uv run`, then relaunches itself under `torch.distributed.run` so each
GPU gets its own worker. Rank 0 initializes Trackio with `auto_log_gpu=True`,
which should record per-GPU system metrics for every visible GPU.

Run locally from this repo:

    python examples/hf-jobs-multi-gpu-system-metrics.py \
        --project local-multi-gpu-demo

Run on HF Jobs with the released package:

    hf jobs uv run \
        --flavor l4x4 \
        --timeout 20m \
        --secrets HF_TOKEN \
        --with torch \
        --with "trackio[gpu]" \
        examples/hf-jobs-multi-gpu-system-metrics.py \
        --project hf-jobs-multi-gpu-demo \
        --space-id <username>/<space-name>

Run on HF Jobs against this PR branch before release:

    hf jobs uv run \
        --flavor l4x4 \
        --timeout 20m \
        --secrets HF_TOKEN \
        --with torch \
        --with "trackio @ git+https://github.com/gradio-app/trackio.git@saba/multi-gpu" \
        --with nvidia-ml-py \
        --with psutil \
        examples/hf-jobs-multi-gpu-system-metrics.py \
        --project hf-jobs-multi-gpu-demo \
        --space-id <username>/<space-name>

After the job starts, open:

    https://huggingface.co/spaces/<username>/<space-name>

Then go to the run's System Metrics page and confirm that metrics such as
`utilization`, `allocated_memory`, `power`, and `temp` are present for multiple
GPUs on the same run.
"""

from __future__ import annotations

import argparse
import os
import socket
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path

import torch
import torch.distributed as dist

import trackio


def parse_args() -> argparse.Namespace:
    timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
    parser = argparse.ArgumentParser()
    parser.add_argument("--project", default=f"hf-jobs-multi-gpu-{timestamp}")
    parser.add_argument("--run-name", default=f"distributed-smoke-{timestamp}")
    parser.add_argument("--space-id", default=None)
    parser.add_argument("--steps", type=int, default=12)
    parser.add_argument("--matrix-size", type=int, default=2048)
    parser.add_argument("--matmul-repeats", type=int, default=6)
    parser.add_argument("--sleep-seconds", type=float, default=0.5)
    parser.add_argument("--gpu-log-interval", type=float, default=1.0)
    parser.add_argument("--nproc-per-node", type=int, default=None)
    parser.add_argument("--no-launch", action="store_true")
    return parser.parse_args()


def maybe_relaunch_distributed(args: argparse.Namespace) -> None:
    if args.no_launch or "RANK" in os.environ:
        return

    if not torch.cuda.is_available():
        print("CUDA is not available, running a single-process fallback.", flush=True)
        return

    detected = torch.cuda.device_count()
    nproc_per_node = args.nproc_per_node or detected
    if nproc_per_node <= 1:
        print("Only one GPU detected, running a single-process fallback.", flush=True)
        return

    script_path = str(Path(__file__).resolve())
    cmd = [
        sys.executable,
        "-m",
        "torch.distributed.run",
        "--standalone",
        "--nnodes=1",
        f"--nproc-per-node={nproc_per_node}",
        script_path,
        "--no-launch",
        *sys.argv[1:],
    ]
    print("Launching distributed workers:", " ".join(cmd), flush=True)
    subprocess.run(cmd, check=True)
    raise SystemExit(0)


def init_distributed() -> tuple[int, int, int, torch.device]:
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    rank = int(os.environ.get("RANK", "0"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))

    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
    else:
        device = torch.device("cpu")

    if world_size > 1 and not dist.is_initialized():
        backend = "nccl" if device.type == "cuda" else "gloo"
        dist.init_process_group(backend=backend)

    return rank, local_rank, world_size, device


def cleanup_distributed() -> None:
    if dist.is_initialized():
        dist.barrier()
        dist.destroy_process_group()


def average_across_workers(value: torch.Tensor, world_size: int) -> float:
    reduced = value.detach().clone()
    if world_size > 1:
        dist.all_reduce(reduced, op=dist.ReduceOp.SUM)
        reduced /= world_size
    return float(reduced.item())


def run_workload(args: argparse.Namespace) -> None:
    maybe_relaunch_distributed(args)
    rank, _, world_size, device = init_distributed()

    if device.type == "cuda":
        dtype = torch.float16
        host = socket.gethostname()
        print(
            f"[rank {rank}] device={torch.cuda.get_device_name(device)} "
            f"host={host} world_size={world_size}",
            flush=True,
        )
    else:
        dtype = torch.float32
        print(f"[rank {rank}] running on CPU", flush=True)

    run = None
    if rank == 0:
        config = {
            "world_size": world_size,
            "matrix_size": args.matrix_size,
            "matmul_repeats": args.matmul_repeats,
            "steps": args.steps,
            "sleep_seconds": args.sleep_seconds,
            "gpu_log_interval": args.gpu_log_interval,
        }
        run = trackio.init(
            project=args.project,
            name=args.run_name,
            config=config,
            space_id=args.space_id,
            auto_log_gpu=True,
            gpu_log_interval=args.gpu_log_interval,
        )
        if args.space_id:
            print(
                f"DASHBOARD_URL=https://huggingface.co/spaces/{args.space_id}",
                flush=True,
            )

    left = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=dtype)
    right = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=dtype)

    for step in range(args.steps):
        start = time.perf_counter()
        work = left
        for _ in range(args.matmul_repeats):
            work = work @ right
        loss = work.float().pow(2).mean().sqrt()

        if device.type == "cuda":
            torch.cuda.synchronize(device)

        step_time = time.perf_counter() - start
        mean_loss = average_across_workers(loss, world_size)
        mean_step_time = average_across_workers(
            torch.tensor(step_time, device=device, dtype=torch.float32),
            world_size,
        )

        if rank == 0 and run is not None:
            total_flops = 2 * args.matmul_repeats * (args.matrix_size**3) * world_size
            tokens_per_second = (
                args.matrix_size
                * args.matrix_size
                * world_size
                / max(mean_step_time, 1e-6)
            )
            trackio.log(
                {
                    "train/rmse": mean_loss,
                    "train/step_time_seconds": mean_step_time,
                    "train/tokens_per_second": tokens_per_second,
                    "train/approx_tflops": total_flops
                    / max(mean_step_time, 1e-6)
                    / 1e12,
                },
                step=step,
            )
            print(
                f"[rank 0] step={step} rmse={mean_loss:.4f} "
                f"step_time={mean_step_time:.3f}s",
                flush=True,
            )

        if dist.is_initialized():
            dist.barrier()
        time.sleep(args.sleep_seconds)

    if rank == 0 and run is not None:
        time.sleep(max(args.gpu_log_interval, 1.0) + 1.0)
        trackio.finish()

    cleanup_distributed()


if __name__ == "__main__":
    run_workload(parse_args())
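
After a local run without a Space, the recorded system metrics can be read back the same way the e2e test below does. A hedged sketch follows: only the `SQLiteStorage.get_system_logs(project=..., run=...)` call is taken from this commit's tests; the import path and the assumption that the run wrote to Trackio's default local database are not confirmed by the diff:

# Assumed import path; the test below only shows the class name SQLiteStorage.
from trackio.sqlite_storage import SQLiteStorage

logs = SQLiteStorage.get_system_logs(
    project="local-multi-gpu-demo",       # matches the local run command above
    run="distributed-smoke-<timestamp>",  # the run name the script generates
)
for entry in logs[:3]:
    # Expect per-GPU keys such as "gpu/0/utilization", "gpu/1/utilization", ...
    print({k: v for k, v in entry.items() if k.startswith("gpu/")})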

tests/e2e-local/test_basic_logging.py

Lines changed: 48 additions & 2 deletions

@@ -120,7 +120,7 @@ def test_reserved_keys_are_renamed(temp_dir):


 def test_auto_log_gpu(temp_dir):
-    def fake_gpu_metrics(device=None):
+    def fake_gpu_metrics(device=None, all_gpus=False):
         return {
             "gpu/0/utilization": 75,
             "gpu/0/allocated_memory": 4.5,
@@ -131,7 +131,7 @@ def fake_gpu_metrics(device=None):
         }

     with patch.object(gpu, "collect_gpu_metrics", fake_gpu_metrics):
-        with patch.object(gpu, "get_gpu_count", return_value=(1, [0])):
+        with patch.object(gpu, "get_all_gpu_count", return_value=(1, [0])):
             with patch("trackio.run.gpu_available", return_value=True):
                 with patch("trackio.run.apple_gpu_available", return_value=False):
                     trackio.init(
@@ -156,3 +156,49 @@ def fake_gpu_metrics(device=None):
     assert log["gpu/0/power"] == 150.0
     assert log["gpu/mean_utilization"] == 75
     assert "timestamp" in log
+
+
+def test_auto_log_gpu_multi(temp_dir):
+    def fake_gpu_metrics(device=None, all_gpus=False):
+        metrics = {
+            "gpu/0/utilization": 75,
+            "gpu/0/allocated_memory": 4.5,
+            "gpu/0/total_memory": 12.0,
+            "gpu/0/temp": 65,
+            "gpu/0/power": 150.0,
+            "gpu/mean_utilization": 70,
+        }
+        if all_gpus:
+            metrics.update(
+                {
+                    "gpu/1/utilization": 65,
+                    "gpu/1/allocated_memory": 3.0,
+                    "gpu/1/total_memory": 12.0,
+                    "gpu/1/temp": 60,
+                    "gpu/1/power": 120.0,
+                }
+            )
+        return metrics
+
+    with patch.object(gpu, "collect_gpu_metrics", fake_gpu_metrics):
+        with patch.object(gpu, "get_all_gpu_count", return_value=(2, [0, 1])):
+            with patch("trackio.run.gpu_available", return_value=True):
+                with patch("trackio.run.apple_gpu_available", return_value=False):
+                    trackio.init(
+                        project="test_gpu_multi",
+                        name="test_gpu_multi_run",
+                        auto_log_gpu=True,
+                        gpu_log_interval=0.1,
+                    )
+                    trackio.log({"loss": 0.5})
+                    time.sleep(0.3)
+                    trackio.finish()
+
+    system_logs = SQLiteStorage.get_system_logs(
+        project="test_gpu_multi", run="test_gpu_multi_run"
+    )
+    assert len(system_logs) >= 1
+    log = system_logs[0]
+    assert log["gpu/0/utilization"] == 75
+    assert log["gpu/1/utilization"] == 65
+    assert log["gpu/mean_utilization"] == 70

0 commit comments
