Skip to content

Commit d25f8ad

Browse files
committed
dbg
stack-info: PR: #1911, branch: shunting314/stack/30
1 parent e9841d2 commit d25f8ad

File tree

4 files changed: +19 additions, −9 deletions

examples/distributed/fp8_matmul_reduce_scatter.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
from helion._testing import DEVICE
2525
from helion._testing import assert_close_with_mismatch_tolerance
2626
from helion._testing import run_example
27+
from helion.autotuner.base_search import (
28+
_assert_close as assert_close_with_mismatch_tolerance,
29+
)
2730
import helion.language as hl
2831
from helion.runtime.dist_utils import symm_mem_sync
2932

@@ -33,13 +36,17 @@
3336
"max_mismatch_pct": 1e-3,
3437
}
3538

39+
config = helion.Config(
40+
block_sizes=[64, 64, 32], # M, N, K
41+
num_warps=8,
42+
num_stages=3,
43+
)
44+
45+
# config = helion.Config(block_sizes=[64, 128, 128], indexing=['pointer', 'pointer', 'pointer', 'tensor_descriptor', 'pointer', 'pointer', 'pointer', 'pointer', 'tensor_descriptor', 'pointer', 'pointer', 'pointer', 'tensor_descriptor', 'pointer'], l2_groupings=[1], load_eviction_policies=['last', '', '', '', '', 'first', '', '', '', '', '', 'last'], loop_orders=[[0, 1]], num_sm_multiplier=2, num_stages=1, num_warps=8, pid_type='persistent_blocked', range_flattens=[None, None], range_multi_buffers=[True, None], range_unroll_factors=[3, 0], range_warp_specializes=[])
46+
3647

3748
@helion.kernel(
38-
config=helion.Config(
39-
block_sizes=[64, 64, 32], # M, N, K
40-
num_warps=8,
41-
num_stages=3,
42-
),
49+
config=config,
4350
static_shapes=True,
4451
ignore_warnings=[helion.exc.TensorOperationInWrapper],
4552
autotune_baseline_accuracy_check_fn=functools.partial(

helion/_testing.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@
3434
from ._utils import counters
3535
from .runtime.settings import _get_backend
3636
from .runtime.settings import is_pallas_interpret
37+
from helion.autotuner.base_search import (
38+
_assert_close as assert_close_with_mismatch_tolerance,
39+
)
3740
from helion.autotuner.base_search import _clone_args
38-
from helion.autotuner.base_search import _assert_close as assert_close_with_mismatch_tolerance
3941

4042
if _get_backend() == "pallas":
4143
from .autotuner.benchmarking import compute_repeat_generic as compute_repeat
@@ -1541,4 +1543,3 @@ def capture_output(self) -> Generator[_OutputCapture, None, None]:
15411543
yield capture
15421544
finally:
15431545
sys.stdout, sys.stderr = old_stdout, old_stderr
1544-

helion/autotuner/base_search.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -635,8 +635,9 @@ def _validate_against_baseline(
635635
custom_check = self.settings.autotune_baseline_accuracy_check_fn
636636
if custom_check is not None:
637637
custom_check(output, self._baseline_output)
638-
if len(self._mutated_arg_indices) > 0:
639-
custom_check(args, self._baseline_post_args)
638+
if os.getenv("CHECK_INPUT_ACCURACY", "1") == "1":
639+
if len(self._mutated_arg_indices) > 0:
640+
custom_check(args, self._baseline_post_args)
640641
else:
641642
_assert_close(
642643
output,

helion/runtime/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,7 @@ class Settings(_Settings):
660660
"If True, allow torch.compile to fuse this Helion kernel with surrounding Inductor ops "
661661
"(prologue/epilogue) when used inside torch.compile. Default False. "
662662
"Set HELION_TORCH_COMPILE_FUSION=1 to enable globally."
663+
),
663664
"config_filter": (
664665
"Optional callable ``(config: Config) -> bool`` that the autotuner calls on every "
665666
"candidate config before compiling or benchmarking it. Configs for which the "

0 commit comments

Comments (0)