Skip to content

Commit 98ab5e0

Browse files
committed
custom config filter
stack-info: PR: #1847, branch: shunting314/stack/25
1 parent 5318ddb commit 98ab5e0

File tree

3 files changed

+129
-5
lines changed

3 files changed

+129
-5
lines changed

helion/autotuner/base_search.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ class BenchmarkResult(NamedTuple):
206206
config: Config
207207
fn: Callable[..., object]
208208
perf: float
209-
status: Literal["ok", "error", "timeout", "peer_compilation_fail"]
209+
status: Literal["ok", "error", "timeout", "peer_compilation_fail", "filtered"]
210210
compile_time: float | None
211211

212212

@@ -913,6 +913,33 @@ def _benchmark(
913913
A list of BenchmarkResult entries containing the configuration, compiled
914914
callable, measured performance, status, and compilation time.
915915
"""
916+
config_filter = self.settings.config_filter
917+
if config_filter is not None:
918+
passing_indices = [i for i, c in enumerate(configs) if config_filter(c)]
919+
if len(passing_indices) < len(configs):
920+
passing_configs = [configs[i] for i in passing_indices]
921+
inner_results = self._benchmark(passing_configs, desc=desc)
922+
inner_iter = iter(inner_results)
923+
merged: list[BenchmarkResult] = []
924+
passing_set = set(passing_indices)
925+
for i, config in enumerate(configs):
926+
if i in passing_set:
927+
merged.append(next(inner_iter))
928+
else:
929+
self.log.debug(
930+
f"Config filtered out by config_filter: {config!r}"
931+
)
932+
merged.append(
933+
BenchmarkResult(
934+
config=config,
935+
fn=lambda *a, **kw: None,
936+
perf=inf,
937+
status="filtered",
938+
compile_time=None,
939+
)
940+
)
941+
return merged
942+
916943
fns: list[Callable[..., object]] = []
917944
valid_configs: list[Config] = []
918945
futures: list[PrecompileFuture] | None = None
@@ -976,7 +1003,9 @@ def _benchmark(
9761003
)
9771004
else:
9781005
compile_time = None
979-
status: Literal["ok", "error", "timeout", "peer_compilation_fail"]
1006+
status: Literal[
1007+
"ok", "error", "timeout", "peer_compilation_fail", "filtered"
1008+
]
9801009
if all(
9811010
all_gather_object(
9821011
is_working, process_group_name=self.kernel.env.process_group_name
@@ -1174,9 +1203,9 @@ class PopulationMember:
11741203
perfs: list[float]
11751204
flat_values: FlatConfig
11761205
config: Config
1177-
status: Literal["ok", "error", "timeout", "peer_compilation_fail", "unknown"] = (
1178-
"unknown"
1179-
)
1206+
status: Literal[
1207+
"ok", "error", "timeout", "peer_compilation_fail", "filtered", "unknown"
1208+
] = "unknown"
11801209
compile_time: float | None = None
11811210

11821211
@property

helion/runtime/settings.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
if TYPE_CHECKING:
3030
from ..autotuner.base_search import BaseAutotuner
3131
from ..autotuner.pattern_search import InitialPopulationStrategy
32+
from .config import Config
3233
from .kernel import BoundKernel
3334

3435
_T = TypeVar("_T")
@@ -513,6 +514,7 @@ class _Settings:
513514
_env_get_bool, "HELION_AUTOTUNE_WITH_TORCH_COMPILE_FUSION", False
514515
)
515516
)
517+
config_filter: Callable[[Config], bool] | None = None
516518

517519

518520
class Settings(_Settings):
@@ -658,6 +660,12 @@ class Settings(_Settings):
658660
"If True, allow torch.compile to fuse this Helion kernel with surrounding Inductor ops "
659661
"(prologue/epilogue) when used inside torch.compile. Default False. "
660662
"Set HELION_TORCH_COMPILE_FUSION=1 to enable globally."
663+
),
"config_filter": (
664+
"Optional callable ``(config: Config) -> bool`` that the autotuner calls on every "
665+
"candidate config before compiling or benchmarking it. Configs for which the "
666+
"callable returns False are skipped entirely (no compilation, no benchmarking). "
667+
"Also filters the explicit ``configs=[...]`` list when one is provided. "
668+
"Pass as @helion.kernel(..., config_filter=my_filter_fn)."
661669
),
662670
"autotune_with_torch_compile_fusion": (
663671
"If True, autotuning benchmarks the fused kernel (with epilogue/prologue) "

test/test_autotuner.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2225,5 +2225,92 @@ def test_autotune_cache_invalid_raises(self):
22252225
bound.settings.autotuner_fn(bound, args)
22262226

22272227

2228+
@onlyBackends(["triton"])
2229+
class TestConfigFilter(TestCase):
2230+
"""Tests for the config_filter setting."""
2231+
2232+
def _make_kernel_and_args(self, **kernel_kwargs):
2233+
@helion.kernel(autotune_log_level=0, **kernel_kwargs)
2234+
def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
2235+
out = torch.empty_like(a)
2236+
for tile in hl.tile(out.size()):
2237+
out[tile] = a[tile] + b[tile]
2238+
return out
2239+
2240+
args = (
2241+
torch.randn([128], device=DEVICE),
2242+
torch.randn([128], device=DEVICE),
2243+
)
2244+
return add, args
2245+
2246+
def test_config_filter_skips_filtered_configs(self) -> None:
2247+
"""Filtered configs produce status='filtered' and perf=inf."""
2248+
cfg1 = helion.Config(block_sizes=[16], num_warps=4)
2249+
cfg2 = helion.Config(block_sizes=[32], num_warps=4)
2250+
cfg3 = helion.Config(block_sizes=[64], num_warps=4)
2251+
2252+
filtered_out: list[helion.Config] = []
2253+
2254+
def my_filter(config: helion.Config) -> bool:
2255+
if config.get("block_sizes") == [32]:
2256+
filtered_out.append(config)
2257+
return False
2258+
return True
2259+
2260+
add, args = self._make_kernel_and_args(
2261+
config_filter=my_filter, autotune_precompile=None
2262+
)
2263+
bound = add.bind(args)
2264+
search = FiniteSearch(bound, args, configs=[cfg1, cfg2, cfg3])
2265+
search._prepare()
2266+
results = search.benchmark_batch([cfg1, cfg2, cfg3])
2267+
2268+
# cfg2 should be filtered
2269+
self.assertEqual(len(filtered_out), 1)
2270+
self.assertEqual(filtered_out[0].get("block_sizes"), [32])
2271+
2272+
statuses = {tuple(r.config.get("block_sizes", [])): r.status for r in results}
2273+
self.assertEqual(statuses[(16,)], "ok")
2274+
self.assertEqual(statuses[(32,)], "filtered")
2275+
self.assertEqual(statuses[(64,)], "ok")
2276+
2277+
perfs = {tuple(r.config.get("block_sizes", [])): r.perf for r in results}
2278+
self.assertEqual(perfs[(32,)], float("inf"))
2279+
2280+
def test_config_filter_affects_autotune_winner(self) -> None:
2281+
"""The autotuner never picks a filtered config as the winner."""
2282+
# cfg_fast would normally win (smallest block = least work per kernel launch
2283+
# in this trivial test), but we filter it out.
2284+
cfg_fast = helion.Config(block_sizes=[16], num_warps=4)
2285+
cfg_slow = helion.Config(block_sizes=[128], num_warps=4)
2286+
2287+
def reject_small_blocks(config: helion.Config) -> bool:
2288+
return (config.get("block_sizes") or [0])[0] >= 64
2289+
2290+
add, args = self._make_kernel_and_args(config_filter=reject_small_blocks)
2291+
bound = add.bind(args)
2292+
search = FiniteSearch(bound, args, configs=[cfg_fast, cfg_slow])
2293+
winner = search.autotune()
2294+
# cfg_fast is filtered out, so cfg_slow must win
2295+
self.assertEqual(winner.get("block_sizes"), [128])
2296+
2297+
def test_config_filter_none_is_noop(self) -> None:
2298+
"""When config_filter=None (default), all configs are benchmarked normally."""
2299+
cfg1 = helion.Config(block_sizes=[16], num_warps=4)
2300+
cfg2 = helion.Config(block_sizes=[32], num_warps=4)
2301+
2302+
add, args = self._make_kernel_and_args(
2303+
autotune_precompile=None
2304+
) # no config_filter
2305+
bound = add.bind(args)
2306+
search = FiniteSearch(bound, args, configs=[cfg1, cfg2])
2307+
search._prepare()
2308+
results = search.benchmark_batch([cfg1, cfg2])
2309+
2310+
for result in results:
2311+
self.assertNotEqual(result.status, "filtered")
2312+
self.assertFalse(math.isinf(result.perf))
2313+
2314+
22282315
if __name__ == "__main__":
22292316
unittest.main()

0 commit comments

Comments
 (0)