Commit 20726a8
[Autotuner] Auto-checkpoint feature and ability to resume from checkpoint
Fixes #1330. Internal customers have had significant pain with IMA errors, and they also find that spawn mode adds too much overhead, making autotuning take much longer. This PR stack adds an auto-recovery feature: it checkpoints regularly (which is useful on its own for the server-crash scenarios mentioned in #1330) and then automatically starts a new autotune process from the previously saved checkpoint when an IMA error occurs (next PR). stack-info: PR: #1920, branch: yf225/stack/90
1 parent ff57fe4 commit 20726a8
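In outline, the recovery flow this stack adds looks like the following sketch (hypothetical helper names, not Helion's API; the real logic lives in `BaseSearch` in `helion/autotuner/base_search.py`): checkpoint after each generation, resume from the file when it exists, and delete it once tuning succeeds.

```python
import os
import pickle

def tune(checkpoint_path: str, max_generations: int, evolve, init_state):
    # Resume from a prior run if a checkpoint exists, else start fresh.
    if os.path.exists(checkpoint_path):
        with open(checkpoint_path, "rb") as f:
            state = pickle.load(f)
    else:
        state = init_state()
    for gen in range(state["generation"], max_generations):
        evolve(state)  # one generation of search
        state["generation"] = gen + 1
        with open(checkpoint_path, "wb") as f:
            pickle.dump(state, f)  # checkpoint every generation
    os.remove(checkpoint_path)  # ephemeral: gone once tuning succeeds
    return state
```

A crashed run re-enters the loop at the last saved generation instead of restarting from zero, which is exactly what makes IMA-triggered restarts cheap.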

File tree

12 files changed (+1246, -212 lines)

docs/api/settings.md

Lines changed: 9 additions & 0 deletions

```diff
@@ -209,6 +209,14 @@ def my_kernel(x: torch.Tensor) -> torch.Tensor:
     Each preset also sets a default initial population strategy (see :doc:`../deployment_autotuning` for details).
     Users can still override individual ``autotune_*`` settings; explicit values win over the preset. Controlled by ``HELION_AUTOTUNE_EFFORT``.
 
+.. autoattribute:: Settings.autotune_checkpoint_dir
+
+    Directory path for saving and resuming autotuning checkpoints. When set, the autotuner
+    saves in-progress state to ``{dir}/{stable_hash}.pt`` and auto-discovers matching
+    checkpoints on subsequent runs. The checkpoint file is deleted on successful completion.
+    When unset (default), no checkpoints are saved or loaded (opt-in).
+    Controlled by ``HELION_AUTOTUNE_CHECKPOINT_DIR``.
+
 .. autoattribute:: Settings.autotune_best_available_max_configs
 
     Maximum number of cached configs to use when seeding the initial population with the ``from_best_available`` strategy.
@@ -323,6 +331,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"LFBOTreeSearch"`` (default),
 | ``HELION_AUTOTUNE_PROGRESS_BAR`` | ``autotune_progress_bar`` | Enable or disable the progress bar UI during autotuning. |
 | ``HELION_AUTOTUNE_IGNORE_ERRORS`` | ``autotune_ignore_errors`` | Continue autotuning even when recoverable runtime errors occur. |
 | ``HELION_AUTOTUNE_CONFIG_OVERRIDES`` | ``autotune_config_overrides`` | Supply JSON forcing particular autotuner config key/value pairs. |
+| ``HELION_AUTOTUNE_CHECKPOINT_DIR`` | ``autotune_checkpoint_dir`` | Directory path for saving/resuming autotuning checkpoints (opt-in). |
 | ``TRITON_STORE_BINARY_ONLY`` | Triton (autotuning) | Set to ``1`` during autotuning to skip Triton intermediate IRs, reducing cache size ~40%. Set to ``0`` to retain IRs for debugging. |
 | ``HELION_CACHE_DIR`` | ``LocalAutotuneCache`` | Override the on-disk directory used for cached autotuning artifacts. |
 | ``HELION_SKIP_CACHE`` | ``LocalAutotuneCache`` | When set to ``1``, skip both reading and writing the autotuning cache entirely. |
```

docs/deployment_autotuning.md

Lines changed: 23 additions & 0 deletions

````diff
@@ -183,6 +183,29 @@ Related settings for `from_best_available` (see {doc}`api/settings`):
 | `autotune_best_available_max_configs` | `HELION_BEST_AVAILABLE_MAX_CONFIGS` | 20 | Maximum cached configs to seed |
 | `autotune_best_available_max_cache_scan` | `HELION_BEST_AVAILABLE_MAX_CACHE_SCAN` | 500 | Maximum cache files to scan |
 
+### Checkpointing Long-Running Autotuning
+
+For very long autotuning sessions, you can save and resume state using
+checkpoints. This is useful when tuning might be interrupted (e.g., preemptible
+instances) or when you want to continue tuning from a previous unfinished run.
+
+Set the `HELION_AUTOTUNE_CHECKPOINT_DIR` environment variable to a directory
+path. The autotuner will periodically save checkpoints there, keyed by the
+kernel's stable hash. If interrupted, re-run with the same directory to resume
+automatically. On successful completion, the checkpoint file is cleaned up.
+
+```bash
+# Enable checkpointing to a directory:
+HELION_AUTOTUNE_CHECKPOINT_DIR=/tmp/helion_checkpoints python run_kernel.py
+
+# If interrupted, just re-run with the same directory to resume:
+HELION_AUTOTUNE_CHECKPOINT_DIR=/tmp/helion_checkpoints python run_kernel.py
+```
+
+Without `HELION_AUTOTUNE_CHECKPOINT_DIR`, no checkpoints are saved (opt-in).
+Multiple kernels can safely use the same directory — each kernel writes to a
+file named by its unique stable hash.
+
 ## Deploy a Single Config
 
 If one configuration wins for every production call, bake it into the decorator:
````
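The note above about sharing one directory across kernels works because filenames are keyed by a per-kernel stable hash. A toy illustration of the idea, using a SHA-256 digest of kernel source and input shapes as a stand-in for Helion's real stable hash:

```python
import hashlib

def checkpoint_filename(kernel_source: str, input_shapes: str) -> str:
    # Stand-in for Helion's stable hash: any digest that is deterministic
    # across runs but distinct per kernel/shape combination will do.
    digest = hashlib.sha256(f"{kernel_source}|{input_shapes}".encode()).hexdigest()
    return f"{digest}.pt"

a = checkpoint_filename("def add(x, y): ...", "(1024,)")
b = checkpoint_filename("def mul(x, y): ...", "(1024,)")
```

Distinct kernels produce distinct filenames, so concurrent tuning jobs never clobber each other's checkpoints in a shared directory.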

helion/_testing.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -57,6 +57,7 @@
 from .runtime.kernel import Kernel
 
 
+
 def _strip_launcher_args(value: str) -> str:
     strip_pairs = []
     if supports_amd_cdna_tunables():
```

helion/autotuner/base_search.py

Lines changed: 210 additions & 0 deletions

```diff
@@ -12,6 +12,8 @@
 import math
 from math import inf
 import os
+from pathlib import Path
+import pickle
 import pprint
 import random
 import re
@@ -343,6 +345,15 @@ def __init__(self, kernel: _AutotunableKernel, args: Sequence[object]) -> None:
         self.args: Sequence[object] = args
         self.log = AutotuningLogger(self.settings)
         self.best_perf_so_far = inf
+        self._current_generation = 0
+        self.counters: collections.Counter[str] = collections.Counter()
+        self._autotune_metrics: AutotuneMetrics = AutotuneMetrics(
+            kernel_name="",
+            input_shapes="",
+            hardware="",
+            random_seed=0,
+            search_algorithm=type(self).__name__,
+        )
         self._prepared = False
         self._precompile_tmpdir: tempfile.TemporaryDirectory[str] | None = None
         self._precompile_args_path: str | None = None
@@ -406,6 +417,90 @@ def cleanup(self) -> None:
         self._precompile_args_path = None
         self._precompile_result_counter = count()
 
+    # Fields excluded from pickle checkpoints: unpicklable infrastructure,
+    # fields recomputed by _prepare(), and fields loaded separately.
+    _CHECKPOINT_EXCLUDE = frozenset(
+        {
+            # Unpicklable infrastructure
+            "kernel",
+            "args",
+            "log",
+            "settings",
+            "config_spec",
+            "_precompile_tmpdir",
+            "_precompile_args_path",
+            "_precompile_result_counter",
+            # Recomputed by _prepare() before checkpoint load
+            "_baseline_output",
+            "_baseline_post_args",
+            "_mutated_arg_indices",
+            "_effective_atol",
+            "_effective_rtol",
+            "_jobs",
+            "_autotune_metrics",
+            "_prepared",
+            "_skip_cache",
+            # Loaded separately via _load_crashed_configs()
+            "_crashed_config_strs",
+        }
+    )
+
+    def __getstate__(self) -> dict[str, Any]:
+        return {
+            k: v for k, v in self.__dict__.items() if k not in self._CHECKPOINT_EXCLUDE
+        }
+
+    _stable_hash: str | None = None
+
+    def _get_stable_hash(self) -> str:
+        """Get the full stable hash for this kernel's cache key (cached)."""
+        if self._stable_hash is None:
+            from .local_cache import LocalAutotuneCache
+
+            self._stable_hash = LocalAutotuneCache(self)._generate_key().stable_hash()
+        return self._stable_hash
+
+    def _try_load_checkpoint(self) -> bool:
+        """Attempt to load checkpoint from checkpoint dir. Returns True if successful."""
+        checkpoint_dir_str = self.settings.autotune_checkpoint_dir
+        if checkpoint_dir_str is None:
+            return False
+
+        checkpoint_dir = Path(checkpoint_dir_str)
+        stable_hash = self._get_stable_hash()
+        checkpoint_file = checkpoint_dir / f"{stable_hash}.pt"
+
+        if not checkpoint_file.exists():
+            return False  # No matching checkpoint; start fresh
+
+        # Matching file exists, attempt to load
+        self.log(f"Resuming from checkpoint: {checkpoint_file}")
+        try:
+            with open(checkpoint_file, "rb") as f:
+                loaded = pickle.load(f)
+        except Exception as e:
+            raise exc.CheckpointError(
+                f"Failed to load checkpoint file '{checkpoint_file}': {e}\n"
+                f"The file may be corrupted. Delete it to start fresh."
+            ) from e
+
+        # Validate stable hash matches (guards against renamed/copied files)
+        loaded_hash = getattr(loaded, "_stable_hash", None)
+        if loaded_hash is not None and loaded_hash != self._get_stable_hash():
+            raise exc.CheckpointError(
+                "Checkpoint is incompatible: kernel, hardware, or input shapes "
+                "may have changed."
+            )
+
+        # Copy loaded search state into self (self already has kernel, args,
+        # log, etc. from __init__ and _prepare())
+        self.__dict__.update(loaded.__dict__)
+        self._recompile_after_checkpoint()
+        self.log(f"Resumed at generation {self._current_generation}")
+        return True
+
+    def _recompile_after_checkpoint(self) -> None:
+        """Recompile after loading a checkpoint. Override in subclasses."""
+
     def _compute_baseline(
         self,
     ) -> tuple[object, Sequence[int], Sequence[object] | None]:
@@ -629,6 +724,7 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
             The performance of the configuration in ms.
         """
         self._autotune_metrics.num_configs_tested += 1
+        self.counters["benchmark"] += 1
         self.log.debug(lambda: f"Running benchmark for {config!r}")
         _captured_output: list[str] = [""]
         _capture_ctx = (
@@ -1089,8 +1185,12 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
             torch.save(self.args, args_path)
             self._precompile_args_path = args_path
             exit_stack.callback(self.cleanup)
+
+            if not self._try_load_checkpoint():
+                self._init_search()
             try:
                 best = self._autotune()
+                self._cleanup_checkpoint()
             finally:
                 self._finalize_autotune_metrics()
             end = time.perf_counter()
@@ -1112,6 +1212,16 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
                 print(triton_code, file=sys.stderr)
         return best
 
+    def _init_search(self) -> None:
+        """
+        Initialize the search state for a fresh autotuning run.
+
+        This method is called when starting autotuning without a checkpoint.
+        Subclasses should override this to set up initial population and state.
+        After this method, _current_generation should be set to the generation
+        that _autotune() should start its loop from.
+        """
+
     def _autotune(self) -> Config:
         """
         Abstract method to perform the actual autotuning.
@@ -1123,6 +1233,68 @@ def _autotune(self) -> Config:
         """
         raise NotImplementedError
 
+    def save_checkpoint(self) -> Path | None:
+        """
+        Save current autotuner state to checkpoint file.
+
+        Only saves when autotune_checkpoint_dir is set (opt-in).
+        Overwrites the same file each generation (keyed by stable hash).
+        Uses pickle to serialize the entire autotuner object (minus unpicklable
+        fields excluded by __getstate__).
+
+        Returns:
+            Path to saved checkpoint file, or None if not saved
+        """
+        from ..runtime.kernel import BoundKernel
+
+        # External kernels don't support caching/checkpointing
+        if not isinstance(self.kernel, BoundKernel):
+            return None
+
+        if not self.kernel.is_cacheable():
+            return None
+
+        checkpoint_dir_str = self.settings.autotune_checkpoint_dir
+        if checkpoint_dir_str is None:
+            return None  # Opt-in: no dir set, no saving
+
+        stable_hash = self._get_stable_hash()
+        checkpoint_dir = Path(checkpoint_dir_str)
+        checkpoint_dir.mkdir(parents=True, exist_ok=True)
+        checkpoint_path = checkpoint_dir / f"{stable_hash}.pt"
+
+        # Atomic write using temp file + rename
+        tmp = checkpoint_dir / f".tmp.{stable_hash}.{os.getpid()}"
+        with open(tmp, "wb") as f:
+            pickle.dump(self, f)
+        os.replace(tmp, checkpoint_path)
+
+        self.log(f"Checkpoint saved: {checkpoint_path}")
+        return checkpoint_path
+
+    def _cleanup_checkpoint(self) -> None:
+        """Delete checkpoint file on successful autotune completion.
+
+        Checkpoints are ephemeral in-progress state. Once autotuning
+        completes successfully, the result is cached normally and the
+        checkpoint is no longer needed.
+        """
+        checkpoint_dir_str = self.settings.autotune_checkpoint_dir
+        if checkpoint_dir_str is None:
+            return
+
+        stable_hash = self._get_stable_hash()
+        checkpoint_file = Path(checkpoint_dir_str) / f"{stable_hash}.pt"
+        if checkpoint_file.exists():
+            checkpoint_file.unlink()
+            self.log(f"Checkpoint cleaned up: {checkpoint_file}")
+
+        # Clean up crash-recovery artifacts
+        for suffix in (".pending_config", ".crashed_configs"):
+            artifact = Path(checkpoint_dir_str) / f"{stable_hash}{suffix}"
+            if artifact.exists():
+                artifact.unlink()
+
     def set_generation(self, generation: int) -> None:
         self._autotune_metrics.num_generations = generation
 
@@ -1177,6 +1349,15 @@ class PopulationMember:
     def perf(self) -> float:
         return self.perfs[-1]
 
+    def __getstate__(self) -> dict[str, Any]:
+        state = self.__dict__.copy()
+        state["fn"] = None  # compiled functions are not picklable
+        return state
+
+    def __setstate__(self, state: dict[str, Any]) -> None:
+        self.__dict__.update(state)
+        self.fn = _unset_fn
+
 
 def performance(member: PopulationMember) -> float:
     """
@@ -1570,6 +1751,14 @@ def rebenchmark_population(
             members = self.population
         self.rebenchmark([p for p in members if self.should_rebenchmark(p)], desc=desc)
 
+    def set_generation(self, generation: int) -> None:
+        if generation == self._current_generation:
+            return
+        self._current_generation = generation
+        super().set_generation(generation)
+        if generation > 0:
+            self.save_checkpoint()
+
     def statistics(self) -> str:
         """
         Generate statistics for the current population.
@@ -1579,6 +1768,27 @@ def statistics(self) -> str:
         """
         return population_statistics(self.population)
 
+    def _recompile_after_checkpoint(self) -> None:
+        """Recompile kernel functions for population members after checkpoint load."""
+        recompile_failures: list[tuple[PopulationMember, str]] = []
+        for member in self.population:
+            if member.fn is _unset_fn and member.status == "ok":
+                try:
+                    member.fn = self.kernel.compile_config(
+                        member.config, allow_print=False
+                    )
+                except Exception as e:
+                    member.fn = _unset_fn
+                    member.status = "error"
+                    member.perfs.append(inf)  # Ensure member won't be selected as best
+                    recompile_failures.append((member, str(e)))
+
+        if recompile_failures:
+            self.log(
+                f"Warning: {len(recompile_failures)} config(s) failed to recompile "
+                f"and will be skipped. First failure: {recompile_failures[0][1]}"
+            )
+
     def run_finishing_phase(
         self, best: PopulationMember, rounds: int
     ) -> PopulationMember:
```
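Two serialization patterns from this file can be demonstrated in isolation: the `__getstate__` exclusion filter (mirroring `_CHECKPOINT_EXCLUDE`) and the atomic temp-file-plus-`os.replace` write from `save_checkpoint`. A self-contained sketch with generic names (not Helion's API):

```python
import os
import pickle
from pathlib import Path

class Search:
    # Fields that must not be pickled: loggers, open handles, anything
    # recomputed on startup. Mirrors _CHECKPOINT_EXCLUDE in spirit.
    _EXCLUDE = frozenset({"log"})

    def __init__(self, checkpoint_dir: str) -> None:
        self.checkpoint_dir = Path(checkpoint_dir)
        self.generation = 0
        self.log = lambda msg: None  # lambdas cannot be pickled

    def __getstate__(self) -> dict:
        return {k: v for k, v in self.__dict__.items() if k not in self._EXCLUDE}

    def save_checkpoint(self) -> Path:
        # Atomic write: dump to a temp file, then rename over the target.
        # os.replace is atomic on the same filesystem, so a crash mid-write
        # can never leave a truncated checkpoint behind.
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        path = self.checkpoint_dir / "state.pt"
        tmp = self.checkpoint_dir / f".tmp.{os.getpid()}"
        with open(tmp, "wb") as f:
            pickle.dump(self, f)
        os.replace(tmp, path)
        return path

    def load_checkpoint(self, path: Path) -> None:
        # Copy the pickled state over a freshly constructed object, which
        # still owns the unpicklable fields skipped by __getstate__.
        with open(path, "rb") as f:
            loaded = pickle.load(f)
        self.__dict__.update(loaded.__dict__)
```

The same division of labor appears above: `__init__` and `_prepare()` rebuild the excluded fields, and `_try_load_checkpoint` merges only the picklable search state back in.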

helion/autotuner/de_surrogate_hybrid.py

Lines changed: 12 additions & 6 deletions

```diff
@@ -135,12 +135,9 @@ def __init__(
         # Track all evaluations for surrogate training
         self.all_observations: list[tuple[FlatConfig, float]] = []
 
-    def _autotune(self) -> Config:
+    def _init_search(self) -> None:
         """
-        Run DE with surrogate-assisted selection.
-
-        Returns:
-            Best configuration found
+        Initialize DE with surrogate-assisted selection.
         """
         self.log("=" * 70)
         self.log("Differential Evolution with Surrogate-Assisted Selection")
@@ -174,8 +171,17 @@ def _autotune(self) -> Config:
         self.best_perf_history = [self.best.perf]
         self.generations_without_improvement = 0
 
+        self.set_generation(2)
+
+    def _autotune(self) -> Config:
+        """
+        Run DE with surrogate-assisted selection.
+
+        Returns:
+            Best configuration found
+        """
         # Evolution loop
-        for gen in range(2, self.max_generations + 1):
+        for gen in range(self._current_generation, self.max_generations + 1):
             self.set_generation(gen)
             self._evolve_generation(gen)
 
```
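The essential change above is starting the evolution loop at `self._current_generation` rather than the constant `2`, which lets a resumed run skip already-completed generations. A toy model of that pattern (illustrative class, not the real search):

```python
class ToySearch:
    # Mirrors the range(start, max + 1) resume pattern from _autotune().
    def __init__(self, max_generations: int) -> None:
        self.max_generations = max_generations
        self._current_generation = 2  # a fresh run starts at generation 2

    def run(self) -> list[int]:
        executed = []
        for gen in range(self._current_generation, self.max_generations + 1):
            self._current_generation = gen
            executed.append(gen)  # stands in for _evolve_generation(gen)
        return executed
```

After a checkpoint restore sets `_current_generation` to, say, 4, `run()` only executes generations 4 and onward instead of repeating the whole schedule.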
