Skip to content

Commit f8e06ac

Browse files
committed
[Autotuner] Add crash recovery bash script for unrecoverable CUDA errors
Add scripts/autotune_with_crash_recovery.sh — a bash wrapper that automatically recovers from CUDA errors (illegal memory access, misaligned address, etc.) that poison the GPU context and kill the autotuning process. How it works: - Before each benchmark, the autotuner writes the current config to a pending file (_pending_config.txt) in the checkpoint directory - If a CUDA error kills the process, the pending file survives on disk - The bash script detects it, appends the poison config to _bad_configs.txt, and re-launches the command from scratch - On re-launch, the autotuner loads its checkpoint + bad configs list, skips the poison config, and continues searching Usage: HELION_AUTOTUNE_CHECKPOINT_DIR=/tmp/ckpt \ scripts/autotune_with_crash_recovery.sh -- python train.py stack-info: PR: #1921, branch: yf225/stack/91
1 parent 4872e5d commit f8e06ac

File tree

6 files changed

+615
-4
lines changed

6 files changed

+615
-4
lines changed

helion/autotuner/base_search.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,7 @@ def __init__(self, kernel: _AutotunableKernel, args: Sequence[object]) -> None:
435435
self._precompile_tmpdir: tempfile.TemporaryDirectory[str] | None = None
436436
self._precompile_args_path: str | None = None
437437
self._precompile_result_counter = count()
438+
self._bad_config_strs: set[str] = set()
438439

439440
def _prepare(self) -> None:
440441
"""Some initialization deferred until autotuning actually runs.
@@ -531,9 +532,53 @@ def _try_load_checkpoint(self) -> bool:
531532
# load_state_dict validates required keys and raises CheckpointError for issues
532533
self.load_state_dict(state)
533534

535+
# Load bad configs (from subprocess crash recovery)
536+
self._load_bad_configs()
537+
534538
self.log(f"Resumed at generation {self._current_generation}")
535539
return True
536540

541+
def _load_bad_configs(self) -> None:
    """Merge previously recorded crash-inducing configs into the skip set.

    Reads ``_bad_configs.txt`` from the checkpoint directory (the file is
    written by the external crash-recovery retry loop) and unions its
    entries into ``self._bad_config_strs``. No file is read when no
    checkpoint directory is configured; a summary is logged whenever the
    skip set ends up non-empty.
    """
    from .subprocess_runner import load_bad_configs

    ckpt_dir = self.settings.autotune_checkpoint_dir
    if ckpt_dir is not None:
        self._bad_config_strs.update(
            load_bad_configs(os.path.join(ckpt_dir, "_bad_configs.txt"))
        )

    n_bad = len(self._bad_config_strs)
    if n_bad:
        self.log(f"Loaded {n_bad} bad config(s) to skip")
554+
555+
def _write_pending_config(self, config_str: str) -> None:
    """Record ``config_str`` as the in-flight benchmark breadcrumb.

    The pending file survives a hard process crash (e.g. an unrecoverable
    CUDA error), letting the external retry loop blocklist the config.
    No-op when no checkpoint directory is configured.
    """
    from .subprocess_runner import write_pending

    ckpt_dir = self.settings.autotune_checkpoint_dir
    if ckpt_dir is not None:
        write_pending(ckpt_dir, config_str)
563+
564+
def _clear_pending_config(self) -> None:
    """Delete the pending-config breadcrumb after a benchmark finishes.

    Leaving the file in place would make the external retry loop treat
    the config as crash-inducing. No-op when no checkpoint directory is
    configured.
    """
    from .subprocess_runner import clear_pending

    ckpt_dir = self.settings.autotune_checkpoint_dir
    if ckpt_dir is not None:
        clear_pending(ckpt_dir)
572+
573+
def _bump_progress(self) -> None:
    """Advance the crash-recovery progress counter by one.

    The external retry loop compares this counter across crashes to
    decide whether the autotuner is still making progress. No-op when
    no checkpoint directory is configured.
    """
    from .subprocess_runner import bump_progress

    ckpt_dir = self.settings.autotune_checkpoint_dir
    if ckpt_dir is not None:
        bump_progress(ckpt_dir)
581+
537582
def _compute_baseline(
538583
self,
539584
) -> tuple[object, Sequence[int], Sequence[object] | None]:
@@ -752,9 +797,16 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
752797
Returns:
753798
The performance of the configuration in ms.
754799
"""
800+
# Skip configs that previously crashed the subprocess
801+
config_str = str(config)
802+
if config_str in self._bad_config_strs:
803+
self.log.warning(f"Skipping known-bad config: {config}")
804+
return inf
805+
755806
self._autotune_metrics.num_configs_tested += 1
756807
self.counters["benchmark"] += 1
757808
self.log.debug(lambda: f"Running benchmark for {config!r}")
809+
self._write_pending_config(config_str)
758810
_captured_output: list[str] = [""]
759811
_capture_ctx = (
760812
capture_output()
@@ -794,6 +846,7 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
794846
if not compile_success_all:
795847
return inf
796848

849+
_is_unrecoverable = False
797850
try:
798851
# TODO(jansel): early exit with fewer trials if early runs are slow
799852
self.log.debug(lambda: f"Running {config} at {datetime.datetime.now()}")
@@ -855,6 +908,7 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
855908
captured_output=_captured_output[0] or None,
856909
)
857910
if match_unrecoverable_runtime_error(e):
911+
_is_unrecoverable = True
858912
self.kernel.maybe_log_repro(self.log.error, self.args, config)
859913
raise exc.TritonUnrecoverableRuntimeError(
860914
reason=str(e),
@@ -908,6 +962,10 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
908962

909963
self._autotune_metrics.num_compile_failures += 1
910964
return inf
965+
finally:
966+
if not _is_unrecoverable:
967+
self._clear_pending_config()
968+
self._bump_progress()
911969

912970
def set_adaptive_compile_timeout(
913971
self,
@@ -1193,6 +1251,8 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
11931251
exit_stack.callback(self.cleanup)
11941252

11951253
if not self._try_load_checkpoint():
1254+
# Load bad configs even on fresh starts (subprocess recovery)
1255+
self._load_bad_configs()
11961256
self._init_search()
11971257
try:
11981258
best = self._autotune()
@@ -1296,6 +1356,11 @@ def _cleanup_checkpoint(self) -> None:
12961356
checkpoint_file.unlink()
12971357
self.log(f"Checkpoint cleaned up: {checkpoint_file}")
12981358

1359+
# Clean up subprocess recovery artifacts
1360+
from .subprocess_runner import cleanup_subprocess_artifacts
1361+
1362+
cleanup_subprocess_artifacts(checkpoint_dir_str)
1363+
12991364
@staticmethod
13001365
def _serialize_numpy_rng_state(
13011366
state: tuple[str, Any, int, int, float],

helion/autotuner/logger.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -466,15 +466,25 @@ def format_triton_compile_failure(
466466
)
467467
)
468468

469+
# CUDA errors that poison the GPU context and require process restart.
470+
# Source: CUDA driver_types.h — all errors documented with
471+
# "To continue using CUDA, the process must be terminated and relaunched."
472+
# Substrings are matched case-insensitively against cudaGetErrorString output.
469473
_UNRECOVERABLE_RUNTIME_ERROR_RE: re.Pattern[str] = re.compile(
470474
"|".join(
471475
map(
472476
re.escape,
473477
[
474-
"illegal memory access",
475-
"misaligned address",
476-
"unspecified launch failure",
477-
"illegal instruction",
478+
"illegal memory access", # cudaErrorIllegalAddress (700)
479+
"misaligned address", # cudaErrorMisalignedAddress (716)
480+
"unspecified launch failure", # cudaErrorLaunchFailure (719)
481+
"illegal instruction", # cudaErrorIllegalInstruction (715)
482+
"device-side assert", # cudaErrorAssert (710)
483+
"hardware stack error", # cudaErrorHardwareStackError (714)
484+
"invalid program counter", # cudaErrorInvalidPc (718)
485+
"not supported on global/shared address space", # cudaErrorInvalidAddressSpace (717)
486+
"tensor memory not completely freed", # cudaErrorTensorMemoryLeak (721)
487+
"launch timed out", # cudaErrorLaunchTimeout (702)
478488
],
479489
)
480490
),
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""File I/O helpers for autotuner crash recovery.
2+
3+
The crash recovery protocol works with an external retry loop
4+
(scripts/autotune_with_crash_recovery.sh). Before benchmarking each
5+
config, the autotuner writes its string representation to a pending
6+
file. If the process crashes (e.g. CUDA illegal memory access), the
7+
pending file survives and the external retry loop records it as a bad
8+
config. On re-run, the autotuner loads the checkpoint + bad configs
9+
and skips the poison config.
10+
"""
11+
12+
from __future__ import annotations
13+
14+
import os
15+
from pathlib import Path
16+
17+
_PENDING_FILENAME = "_pending_config.txt"
_BAD_CONFIGS_FILENAME = "_bad_configs.txt"


def write_pending(checkpoint_dir: str, config_str: str) -> None:
    """Write the config being benchmarked to the pending file.

    The pending file is the crash breadcrumb: it must be on disk before
    the benchmark runs so it survives an abrupt process death. The write
    is therefore flushed and fsync'd, matching ``_append_bad_config``,
    rather than left in user-space/OS buffers. (A plain process crash
    preserves page-cache contents; the fsync additionally guards against
    node-level faults that a poisoned GPU can sometimes trigger.)

    Args:
        checkpoint_dir: Autotuner checkpoint directory.
        config_str: String representation of the config under test.
    """
    pending_path = Path(checkpoint_dir) / _PENDING_FILENAME
    with open(pending_path, "w") as f:
        f.write(config_str)
        f.flush()
        os.fsync(f.fileno())


def clear_pending(checkpoint_dir: str) -> None:
    """Remove the pending file after a benchmark completes.

    Uses ``unlink(missing_ok=True)`` so a never-written or already-removed
    file is not an error (and to avoid the exists/unlink race the previous
    two-step check had).
    """
    (Path(checkpoint_dir) / _PENDING_FILENAME).unlink(missing_ok=True)
32+
33+
34+
def load_bad_configs(bad_configs_path: str) -> set[str]:
    """Load bad config strings from *bad_configs_path*, one per line.

    Blank lines and surrounding whitespace are ignored. A missing file
    yields an empty set. The read is EAFP-style (``try``/``except``
    instead of an ``exists()`` pre-check) so there is no window in which
    the file can disappear between check and read.

    Returns:
        The set of non-empty, stripped lines.
    """
    try:
        text = Path(bad_configs_path).read_text()
    except FileNotFoundError:
        return set()
    return {line.strip() for line in text.splitlines() if line.strip()}


def _append_bad_config(bad_configs_path: str, config_str: str) -> None:
    """Append *config_str* as one line to the bad configs file.

    Flushed and fsync'd so the record survives an abrupt process death —
    this file is what lets a re-launched autotuner skip poison configs.
    """
    with open(bad_configs_path, "a") as f:
        f.write(config_str + "\n")
        f.flush()
        os.fsync(f.fileno())
49+
50+
51+
_PROGRESS_FILENAME = "_configs_tested.txt"


def bump_progress(checkpoint_dir: str) -> None:
    """Increment the configs-tested counter.

    Called after each benchmark completes (success or recoverable error)
    without crashing the process. The bash crash-recovery script reads
    this counter to detect whether the autotuner is making progress.

    Delegates the read to :func:`read_progress` so the parse/fallback
    logic (missing or unparsable file counts as 0) lives in one place.
    """
    progress_path = Path(checkpoint_dir) / _PROGRESS_FILENAME
    progress_path.write_text(str(read_progress(checkpoint_dir) + 1))


def read_progress(checkpoint_dir: str) -> int:
    """Read the configs-tested counter.

    EAFP-style read: no ``exists()`` pre-check, so there is no race with
    concurrent cleanup of the checkpoint directory.

    Returns:
        The stored count, or 0 if the file is missing or unparsable.
    """
    progress_path = Path(checkpoint_dir) / _PROGRESS_FILENAME
    try:
        return int(progress_path.read_text().strip())
    except (FileNotFoundError, ValueError):
        return 0
81+
82+
83+
def cleanup_subprocess_artifacts(checkpoint_dir: str) -> None:
    """Remove crash-recovery files in the checkpoint directory.

    Deletes the pending-config breadcrumb, the bad-configs list, and the
    progress counter. ``unlink(missing_ok=True)`` makes the call
    idempotent and removes the exists/unlink race of the previous
    two-step check.
    """
    checkpoint_path = Path(checkpoint_dir)
    for name in (
        _PENDING_FILENAME,
        _BAD_CONFIGS_FILENAME,
        _PROGRESS_FILENAME,
    ):
        (checkpoint_path / name).unlink(missing_ok=True)
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#!/usr/bin/env bash
# Autotuner crash recovery wrapper.
#
# Runs a command (typically a Python script that calls helion autotuning)
# in a retry loop. When the process crashes due to an unrecoverable CUDA
# error (illegal memory access, misaligned address, etc.), the autotuner
# leaves a "_pending_config.txt" breadcrumb in the checkpoint directory.
# This script detects that file, records the poison config in
# "_bad_configs.txt", and re-runs the command. On re-run the autotuner
# loads its checkpoint and skips the bad config.
#
# Progress detection:
#   The autotuner writes a counter to _configs_tested.txt after each
#   successful benchmark. This script checks whether the counter advanced
#   between crashes. If it did, the autotuner is making progress and we
#   keep retrying indefinitely. If the counter doesn't advance for 3
#   consecutive crashes, the autotuner is stuck and we give up.
#
# Requirements:
#   - HELION_AUTOTUNE_CHECKPOINT_DIR must be set
#
# Usage:
#   HELION_AUTOTUNE_CHECKPOINT_DIR=/tmp/ckpt \
#     scripts/autotune_with_crash_recovery.sh -- COMMAND [ARGS...]
#
# Examples:
#   HELION_AUTOTUNE_CHECKPOINT_DIR=/tmp/autotune_ckpt \
#     scripts/autotune_with_crash_recovery.sh -- python train.py

# Deliberately no `set -e`: we must inspect the command's exit code ourselves.
set -uo pipefail

MAX_NO_PROGRESS=3

# --- Argument parsing ---
usage() {
    cat >&2 <<'EOF'
Usage: HELION_AUTOTUNE_CHECKPOINT_DIR=/path/to/dir \
       autotune_with_crash_recovery.sh -- COMMAND [ARGS...]
EOF
    exit "${1:-1}"
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        -h|--help)
            usage 0
            ;;
        --)
            shift
            break
            ;;
        *)
            echo "Error: unknown option '$1'" >&2
            usage 1
            ;;
    esac
done

if [[ $# -eq 0 ]]; then
    echo "Error: no command specified after --" >&2
    usage 1
fi

if [[ -z "${HELION_AUTOTUNE_CHECKPOINT_DIR:-}" ]]; then
    echo "Error: HELION_AUTOTUNE_CHECKPOINT_DIR must be set." >&2
    exit 1
fi

# --- Setup ---
checkpoint_dir="$HELION_AUTOTUNE_CHECKPOINT_DIR"
mkdir -p "$checkpoint_dir"

pending_file="$checkpoint_dir/_pending_config.txt"
bad_configs_file="$checkpoint_dir/_bad_configs.txt"
progress_file="$checkpoint_dir/_configs_tested.txt"

read_progress() {
    if [[ -f "$progress_file" ]]; then
        cat "$progress_file"
    else
        echo 0
    fi
}

# --- Retry loop ---
attempt=0
no_progress_count=0
last_progress=$(read_progress)

while true; do
    attempt=$((attempt + 1))

    # Run the user command; capture the exit code manually.
    "$@"
    exit_code=$?

    if [[ $exit_code -eq 0 ]]; then
        exit 0
    fi

    # Check if the autotuner left a pending config breadcrumb
    if [[ -f "$pending_file" ]]; then
        config=$(cat "$pending_file")
        rm -f "$pending_file"
        # Use printf (not echo) so configs containing backslash sequences
        # or option-looking prefixes are recorded verbatim: the Python side
        # matches these lines back byte-for-byte against str(config).
        printf '%s\n' "$config" >> "$bad_configs_file"

        # Check progress: did the autotuner test any configs before crashing?
        current_progress=$(read_progress)
        if [[ "$current_progress" -gt "$last_progress" ]]; then
            configs_tested=$((current_progress - last_progress))
            echo "[crash-recovery] Process crashed (exit code $exit_code, attempt $attempt). Tested $configs_tested config(s) before crash." >&2
            no_progress_count=0
            last_progress=$current_progress
        else
            no_progress_count=$((no_progress_count + 1))
            echo "[crash-recovery] Process crashed (exit code $exit_code, attempt $attempt). No configs tested before crash ($no_progress_count/$MAX_NO_PROGRESS consecutive)." >&2
        fi
        echo "[crash-recovery] Blocked config: $config" >&2

        if [[ $no_progress_count -ge $MAX_NO_PROGRESS ]]; then
            echo "[crash-recovery] No progress after $MAX_NO_PROGRESS consecutive crashes — the autotuner appears stuck." >&2
            echo "[crash-recovery] All bad configs have been recorded. You can re-run this script and it will resume from the latest checkpoint, skipping all previously recorded bad configs." >&2
            exit 1
        fi

        echo "[crash-recovery] Restarting from checkpoint..." >&2
    else
        # No pending file — this is not a recoverable CUDA crash.
        # Propagate the original exit code.
        exit "$exit_code"
    fi
done

0 commit comments

Comments
 (0)