@@ -350,6 +350,7 @@ def __init__(self, kernel: _AutotunableKernel, args: Sequence[object]) -> None:
350350 self ._precompile_tmpdir : tempfile .TemporaryDirectory [str ] | None = None
351351 self ._precompile_args_path : str | None = None
352352 self ._precompile_result_counter = count ()
353+ self ._crashed_config_strs : set [str ] = set ()
353354
354355 def _prepare (self ) -> None :
355356 """Some initialization deferred until autotuning actually runs.
@@ -739,6 +740,12 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
739740 Returns:
740741 The performance of the configuration in ms.
741742 """
743+ # Skip configs that previously crashed the subprocess
744+ config_str = str (config )
745+ if config_str in self ._crashed_config_strs :
746+ self .log .warning (f"Skipping known-crashed config: { config } " )
747+ return inf
748+
742749 self ._autotune_metrics .num_configs_tested += 1
743750 self .log .debug (lambda : f"Running benchmark for { config !r} " )
744751 _captured_output : list [str ] = ["" ]
@@ -1018,13 +1025,36 @@ def _benchmark(
10181025 A list of BenchmarkResult entries containing the configuration, compiled
10191026 callable, measured performance, status, and compilation time.
10201027 """
1028+ # Filter out known-crashed configs before compilation
1029+ if self ._crashed_config_strs :
1030+ original_len = len (configs )
1031+ configs = [c for c in configs if str (c ) not in self ._crashed_config_strs ]
1032+ skipped = original_len - len (configs )
1033+ if skipped :
1034+ self .log .warning (
1035+ f"Skipped { skipped } known-crashed config(s) before compilation"
1036+ )
1037+ if not configs :
1038+ return []
1039+
10211040 fns : list [Callable [..., object ]] = []
10221041 valid_configs : list [Config ] = []
10231042 futures : list [PrecompileFuture ] | None = None
1043+ pending_path = self ._get_pending_config_path ()
10241044 for i , config in enumerate (configs ):
1045+ # Write sentinel before compile so a hard crash (SIGKILL /
1046+ # CUDA IMA) leaves a trace the crash recovery script can find.
1047+ if pending_path is not None :
1048+ pending_path .write_text (str (config ))
10251049 try :
10261050 fn = self .kernel .compile_config (config , allow_print = False )
1027- except Exception :
1051+ except Exception as e :
1052+ if match_unrecoverable_runtime_error (e ):
1053+ # Leave sentinel for crash recovery — CUDA context is
1054+ # corrupted and the process cannot continue.
1055+ raise
1056+ if pending_path is not None :
1057+ pending_path .unlink (missing_ok = True )
10281058 # If all configs failed, raise error
10291059 if not valid_configs and i == len (configs ) - 1 :
10301060 raise
@@ -1034,9 +1064,14 @@ def _benchmark(
10341064 exc_info = True ,
10351065 )
10361066 continue
1067+ if pending_path is not None :
1068+ pending_path .unlink (missing_ok = True )
10371069 fns .append (fn )
10381070 valid_configs .append (config )
10391071 configs = valid_configs
1072+ # NOTE: precompile runs in separate subprocesses with isolated CUDA
1073+ # contexts; crashes there are caught via is_working checks, not
1074+ # sentinels.
10401075 if self .settings .autotune_precompile :
10411076 futures = list (
10421077 starmap (
@@ -1098,7 +1133,14 @@ def _benchmark(
10981133 )
10991134 )
11001135 # benchmark one-by-one to avoid noisy results
1136+ # Write pending-config sentinel; cleared after benchmark.
1137+ # On crash the file stays so the crash recovery script can
1138+ # detect which config caused the failure.
1139+ if pending_path is not None :
1140+ pending_path .write_text (str (config ))
11011141 perf = self .benchmark_function (config , fn )
1142+ if pending_path is not None :
1143+ pending_path .unlink (missing_ok = True )
11021144 status = "ok" if math .isfinite (perf ) else "error"
11031145 # Log completion after benchmarking
11041146 self .log .record_autotune_entry (
@@ -1204,6 +1246,7 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
12041246 checkpoint_enabled = self .settings .autotune_checkpoint_dir is not None
12051247 if not (checkpoint_enabled and self ._try_load_checkpoint ()):
12061248 self ._init_search ()
1249+ self ._load_crashed_configs ()
12071250 try :
12081251 best = self ._autotune ()
12091252 if checkpoint_enabled :
0 commit comments