
Commit 0104696 (1 parent: 9c6b4e0)

[Autotuner] Auto-checkpoint feature and ability to resume from checkpoint
File tree: 12 files changed, +2071 −109 lines

docs/api/settings.md

Lines changed: 8 additions & 0 deletions

````diff
@@ -197,6 +197,13 @@ def my_kernel(x: torch.Tensor) -> torch.Tensor:
 
 Users can still override individual ``autotune_*`` settings; explicit values win over the preset. Controlled by ``HELION_AUTOTUNE_EFFORT``.
 
+.. autoattribute:: Settings.autotune_checkpoint_id
+
+   Checkpoint ID for resuming autotuning from a previous checkpoint. When set, the autotuner attempts to load
+   state from a checkpoint file matching this ID, allowing long-running autotuning sessions to be interrupted
+   and resumed. The checkpoint ID contains a hash prefix that identifies the kernel, hardware, and input shapes.
+   If the hash doesn't match, a ``CheckpointError`` is raised.
+   Controlled by ``HELION_AUTOTUNE_CHECKPOINT_ID``.
 
 ```
@@ -295,6 +302,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"Differe
 | ``HELION_AUTOTUNE_PROGRESS_BAR`` | ``autotune_progress_bar`` | Enable or disable the progress bar UI during autotuning. |
 | ``HELION_AUTOTUNE_IGNORE_ERRORS`` | ``autotune_ignore_errors`` | Continue autotuning even when recoverable runtime errors occur. |
 | ``HELION_AUTOTUNE_CONFIG_OVERRIDES`` | ``autotune_config_overrides`` | Supply JSON forcing particular autotuner config key/value pairs. |
+| ``HELION_AUTOTUNE_CHECKPOINT_ID`` | ``autotune_checkpoint_id`` | Checkpoint ID for resuming autotuning from a previous checkpoint. |
 | ``HELION_CACHE_DIR`` | ``LocalAutotuneCache`` | Override the on-disk directory used for cached autotuning artifacts. |
 | ``HELION_SKIP_CACHE`` | ``LocalAutotuneCache`` | When set to ``1``, ignore cached autotuning entries and rerun searches. |
 | ``HELION_ASSERT_CACHE_HIT`` | ``AutotuneCacheBase`` | When set to ``1``, require a cache hit; raises ``CacheAssertionError`` on cache miss with detailed diagnostics. |
````

docs/deployment_autotuning.md

Lines changed: 24 additions & 0 deletions

````diff
@@ -104,6 +104,30 @@ tuning time versus coverage, or try different search algorithms.
 need more reproducibility; see {doc}`api/settings`. Note this only
 affects which configs are tried, not the timing results.
 
+### Checkpointing Long-Running Autotuning
+
+For very long autotuning sessions, you can save and resume state using
+checkpoints. This is useful when tuning might be interrupted (e.g., preemptible
+instances) or when you want to continue tuning from a previous unfinished run.
+
+The simplest approach is to use the `HELION_AUTOTUNE_CHECKPOINT_ID` environment
+variable. When autotuning runs, it periodically saves checkpoints and logs the
+checkpoint ID. To resume, set this environment variable to the checkpoint ID
+from a previous run.
+
+```bash
+# First run - autotuning will log checkpoint IDs as it progresses:
+# "Checkpoint saved: .../autotuner_checkpoints/a1b2c3d4_1706123456_e5f6g7h8.checkpoint"
+# "To resume from this checkpoint, set HELION_AUTOTUNE_CHECKPOINT_ID=a1b2c3d4_1706123456_e5f6g7h8 ..."
+python run_kernel.py
+
+# If interrupted, resume from the last checkpoint:
+HELION_AUTOTUNE_CHECKPOINT_ID=a1b2c3d4_1706123456_e5f6g7h8 python run_kernel.py
+```
+
+The checkpoint ID contains a hash prefix that identifies the kernel, hardware,
+and input shapes. If the hash doesn't match, a `CheckpointError` is raised.
+
 ## Deploy a Single Config
 
 If one configuration wins for every production call, bake it into the decorator:
````
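The example ID logged above (`a1b2c3d4_1706123456_e5f6g7h8`) suggests a three-part underscore-separated layout: the hash prefix that identifies kernel, hardware, and input shapes, followed by what looks like a Unix timestamp and a run-specific suffix. A sketch of splitting one apart — the meanings of the two trailing fields are inferred from the example, not a documented format:

```python
from typing import NamedTuple

class CheckpointId(NamedTuple):
    hash_prefix: str  # identifies kernel, hardware, and input shapes
    timestamp: int    # appears to be the Unix time the checkpoint was created
    run_suffix: str   # distinguishes runs sharing the same hash prefix

def parse_checkpoint_id(checkpoint_id: str) -> CheckpointId:
    """Split an ID like 'a1b2c3d4_1706123456_e5f6g7h8' into its parts."""
    hash_prefix, timestamp, run_suffix = checkpoint_id.split("_")
    return CheckpointId(hash_prefix, int(timestamp), run_suffix)
```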

helion/_testing.py

Lines changed: 22 additions & 0 deletions

```diff
@@ -10,13 +10,15 @@
 import operator
 import os
 from pathlib import Path
+import random
 import re
 import sys
 from typing import TYPE_CHECKING
 from typing import Callable
 from typing import Generator
 import unittest
 
+import numpy as np
 import pytest
 import torch
 from torch.utils._pytree import tree_map
@@ -40,6 +42,26 @@
 from .runtime.kernel import Kernel
 
 
+def seed_rng(seed: int) -> None:
+    random.seed(seed)
+    np.random.seed(seed)  # noqa: NPY002
+    torch.manual_seed(seed)
+
+
+@contextlib.contextmanager
+def fork_rng() -> Generator[None, None, None]:
+    """Context manager that forks all RNGs and restores original state on exit."""
+    python_state = random.getstate()
+    numpy_state = np.random.get_state()  # noqa: NPY002
+
+    with torch.random.fork_rng():
+        try:
+            yield
+        finally:
+            random.setstate(python_state)
+            np.random.set_state(numpy_state)  # noqa: NPY002
+
+
 def _strip_launcher_args(value: str) -> str:
     strip_pairs = []
     if supports_amd_cdna_tunables():
```
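The new `fork_rng` helper layers Python and NumPy state save/restore around `torch.random.fork_rng()`, so draws inside the block leave all three generators untouched outside it. The same pattern reduced to the standard-library `random` module, so it runs without torch or numpy — a simplified sketch, not the Helion helper itself:

```python
import contextlib
import random
from typing import Generator

@contextlib.contextmanager
def fork_python_rng() -> Generator[None, None, None]:
    """Snapshot the `random` module's state and restore it on exit."""
    state = random.getstate()
    try:
        yield
    finally:
        random.setstate(state)

random.seed(0)
before = random.random()  # first draw from seed 0

random.seed(0)
with fork_python_rng():
    random.random()  # draws inside the fork consume the stream...
    random.random()
after = random.random()  # ...but the state outside is unaffected

assert before == after
```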
