|
| 1 | +""" |
| 2 | +Demonstrates Trackio's run resume behavior when a job crashes and restarts with |
| 3 | +the same human-readable run name. |
| 4 | +
|
| 5 | +Usage: |
| 6 | + python examples/crash-and-resume-same-run-name.py |
| 7 | + python examples/crash-and-resume-same-run-name.py --resume never |
| 8 | + python examples/crash-and-resume-same-run-name.py --resume allow |
| 9 | + python examples/crash-and-resume-same-run-name.py --resume must |
| 10 | +
|
| 11 | +This example runs both phases in a single invocation: |
| 12 | +- phase 1 always starts a fresh run and logs `--crash-steps` steps (50 by default) |
| 13 | +- a simulated crash interrupts the job |
| 14 | +- phase 2 restarts the job with the configured resume mode and logs 100 more steps |
| 15 | +
|
| 16 | +The restart behavior is controlled by `--resume`: |
| 17 | +- `never`: restart creates a second run with the same name and a new run_id |
| 18 | +- `allow`: restart resumes the latest run with that name if it exists |
| 19 | +- `must`: restart must resume an existing run with that name |
| 20 | +""" |
| 21 | + |
| 22 | +import argparse |
| 23 | +import math |
| 24 | +import uuid |
| 25 | +import warnings |
| 26 | + |
# Install the filter *before* importing trackio (which is why the import below
# carries an E402 suppression): it silences SyntaxWarning messages attributed
# to modules whose name matches ``pydub\.utils``.
# NOTE(review): presumably pydub gets imported as a side effect of importing
# trackio -- confirm.
warnings.filterwarnings(
    "ignore",
    category=SyntaxWarning,
    module=r"pydub\.utils",
)

import trackio  # noqa: E402
| 34 | + |
# Default for --project: a fixed prefix plus the first 8 hex characters of a
# fresh UUID4, so the default project name differs on every invocation.
DEFAULT_PROJECT = f"crash-and-resume-demo-{uuid.uuid4().hex[:8]}"
# Default for --run-name; both phases use the same name.
DEFAULT_RUN_NAME = "trainer-job-42"
# Default for --crash-steps: number of steps logged in phase 1 (before the
# simulated crash).
DEFAULT_CRASH_STEPS = 50
# Default for --restart-steps: number of steps logged in phase 2 (after the
# restart).
DEFAULT_RESTART_STEPS = 100
| 39 | + |
| 40 | + |
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the demo.

    Recognised options:
        --project        project name (default: DEFAULT_PROJECT)
        --run-name       run name shared by both phases (default: DEFAULT_RUN_NAME)
        --crash-steps    integer, steps logged in phase 1 (default: DEFAULT_CRASH_STEPS)
        --restart-steps  integer, steps logged in phase 2 (default: DEFAULT_RESTART_STEPS)
        --resume         one of "never", "allow", "must" (default: "never")

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser()

    # Free-form string options.
    cli.add_argument("--project", default=DEFAULT_PROJECT)
    cli.add_argument("--run-name", default=DEFAULT_RUN_NAME)

    # Integer step counts, one per phase.
    step_options = (
        ("--crash-steps", DEFAULT_CRASH_STEPS),
        ("--restart-steps", DEFAULT_RESTART_STEPS),
    )
    for flag, fallback in step_options:
        cli.add_argument(flag, type=int, default=fallback)

    # Resume mode applied to phase 2 only; phase 1 always starts fresh.
    resume_modes = ["never", "allow", "must"]
    cli.add_argument(
        "--resume",
        choices=resume_modes,
        default="never",
        help="Resume mode used for the simulated restart phase.",
    )

    namespace = cli.parse_args()
    return namespace
| 54 | + |
| 55 | + |
def log_phase(
    start_step: int, num_steps: int, start_loss: float, end_loss: float
) -> None:
    """Log ``num_steps`` synthetic metric points to the active Trackio run.

    The loss moves linearly from ``start_loss`` to ``end_loss`` across the
    phase with a small sine ripple on top. The accuracy is derived from the
    loss relative to ``start_loss``, gets a small cosine ripple, and is
    clamped to the range [0.0, 0.999]. Both are rounded to 4 decimals.

    Args:
        start_step: Used only for the printed "Logging steps a..b" message;
            it is not forwarded to ``trackio.log``.
        num_steps: Number of points to log.
        start_loss: Loss at the beginning of the phase.
        end_loss: Loss at the end of the phase.
    """
    final_step = start_step + num_steps - 1
    print(f"Logging steps {start_step}..{final_step}")

    # Loop invariants. The max() guards avoid dividing by zero when the phase
    # has a single step or when start_loss is (close to) zero.
    denominator = max(1, num_steps - 1)
    loss_delta = end_loss - start_loss
    reference_loss = max(start_loss, 0.01)

    for index in range(num_steps):
        fraction = index / denominator
        loss = start_loss + (loss_delta * fraction) + (0.01 * math.sin(index / 6))

        raw_accuracy = (
            0.25 + (0.7 * (1 - (loss / reference_loss))) + (0.02 * math.cos(index / 9))
        )
        clamped_accuracy = max(0.0, min(0.999, raw_accuracy))

        metrics = {
            "loss": round(loss, 4),
            "accuracy": round(clamped_accuracy, 4),
            "phase_progress": index + 1,
        }
        # step=None is passed explicitly; presumably Trackio then assigns the
        # step number itself -- confirm against the trackio.log documentation.
        trackio.log(metrics, step=None)
| 80 | + |
| 81 | + |
def start_run(
    project: str,
    run_name: str,
    resume: str,
    phase: str,
    crash_steps: int,
    restart_steps: int,
):
    """Initialise a Trackio run for one phase and print its identity.

    Args:
        project: Project name passed to ``trackio.init``.
        run_name: Run name passed to ``trackio.init`` as ``name``.
        resume: Resume mode passed to ``trackio.init``; also stored in the
            run config as ``resume_mode``.
        phase: Label for this phase, stored in the run config and printed.
        crash_steps: Stored in the run config.
        restart_steps: Stored in the run config.

    Returns:
        The object returned by ``trackio.init``.
    """
    run_config = {
        "phase": phase,
        "resume_mode": resume,
        "crash_steps": crash_steps,
        "restart_steps": restart_steps,
    }
    active_run = trackio.init(
        project=project, name=run_name, resume=resume, config=run_config
    )

    # Print the name and id so the two phases can be compared in the output.
    print(f"Trackio run name: {active_run.name}")
    print(f"Trackio run id: {active_run.id}")
    print(f"Phase: {phase}")
    print(f"Resume mode: {resume}")
    return active_run
| 106 | + |
| 107 | + |
def main() -> None:
    """Run both phases of the crash-and-resume demo in one invocation.

    Phase 1 starts a run with ``resume="never"`` and logs ``--crash-steps``
    points. Phase 2 starts a run with the same project and name using the
    ``--resume`` mode from the command line and logs ``--restart-steps``
    points. Finally, it prints whether the second run has the same id as the
    first.
    """
    args = parse_args()

    # Settings common to both phases; only `resume` and `phase` differ.
    shared = {
        "project": args.project,
        "run_name": args.run_name,
        "crash_steps": args.crash_steps,
        "restart_steps": args.restart_steps,
    }

    print("=== phase 1: start fresh run ===")
    first_run = start_run(resume="never", phase="crash", **shared)
    log_phase(start_step=0, num_steps=args.crash_steps, start_loss=0.7, end_loss=0.6)
    trackio.finish()

    print(f"Simulated crash after {args.crash_steps} steps. Restarting the job now.")

    print("=== phase 2: restart job ===")
    restarted_run = start_run(resume=args.resume, phase="restart", **shared)
    log_phase(
        start_step=args.crash_steps,
        num_steps=args.restart_steps,
        start_loss=0.7,
        end_loss=0.2,
    )
    trackio.finish()

    # Matching ids mean phase 2 continued the phase 1 run instead of creating
    # a second run with the same name.
    reused_original = restarted_run.id == first_run.id
    print(f"Restart reused original run id: {reused_original}")
    print(f"Project: {args.project}")
    print("Done. Open the dashboard to inspect the resulting run list and charts.")


if __name__ == "__main__":
    main()
0 commit comments