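dbg: hoist the fp8_matmul_reduce_scatter example config to module level, gate the mutated-input accuracy check behind CHECK_INPUT_ACCURACY, and reformat imports/long lines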

shunting314 · shunting314 · commit 55a9afbdaf3f · 2026-04-01T15:37:06.000-07:00
stack-info: PR: #1911, branch: shunting314/stack/30
diff --git a/examples/distributed/fp8_matmul_reduce_scatter.py b/examples/distributed/fp8_matmul_reduce_scatter.py
@@ -21,9 +21,11 @@
 import torch.distributed._symmetric_memory as symm_mem
 
 import helion
-from helion.autotuner.base_search import _assert_close as assert_close_with_mismatch_tolerance
 from helion._testing import DEVICE
 from helion._testing import run_example
+from helion.autotuner.base_search import (
+    _assert_close as assert_close_with_mismatch_tolerance,
+)
 import helion.language as hl
 from helion.runtime.dist_utils import symm_mem_sync
 
@@ -33,13 +35,17 @@
     max_mismatch_pct=1e-3,
 )
 
+config = helion.Config(
+    block_sizes=[64, 64, 32],  # M, N, K
+    num_warps=8,
+    num_stages=3,
+)
+
+# config = helion.Config(block_sizes=[64, 128, 128], indexing=['pointer', 'pointer', 'pointer', 'tensor_descriptor', 'pointer', 'pointer', 'pointer', 'pointer', 'tensor_descriptor', 'pointer', 'pointer', 'pointer', 'tensor_descriptor', 'pointer'], l2_groupings=[1], load_eviction_policies=['last', '', '', '', '', 'first', '', '', '', '', '', 'last'], loop_orders=[[0, 1]], num_sm_multiplier=2, num_stages=1, num_warps=8, pid_type='persistent_blocked', range_flattens=[None, None], range_multi_buffers=[True, None], range_unroll_factors=[3, 0], range_warp_specializes=[])
+
 
 @helion.kernel(
-    config=helion.Config(
-        block_sizes=[64, 64, 32],  # M, N, K
-        num_warps=8,
-        num_stages=3,
-    ),
+    config=config,
     static_shapes=True,
     ignore_warnings=[helion.exc.TensorOperationInWrapper],
     autotune_baseline_accuracy_check_fn=functools.partial(
@@ -82,8 +88,10 @@ def fp8_matmul_reduce_scatter_kernel(
             acc = hl.dot(a[tile_m, tile_k], b[tile_k, tile_n], acc=acc)
 
         # Apply per-row and per-column scales
-        acc = acc * scale_a[tile_m, :].to(torch.float32) * scale_b[:, tile_n].to(
-            torch.float32
+        acc = (
+            acc
+            * scale_a[tile_m, :].to(torch.float32)
+            * scale_b[:, tile_n].to(torch.float32)
         )
 
         # Store bfloat16 partial result to this rank's symmetric-memory buffer
@@ -165,7 +173,9 @@ def reference_fp8_matmul_reduce_scatter(
     if group is None:
         raise RuntimeError("Distributed group is not initialized")
 
-    c = torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
+    c = torch._scaled_mm(
+        a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16
+    )
 
     world_size = dist.get_world_size(group)
     M_scatter = c.shape[0] // world_size
@@ -223,7 +233,10 @@ def test(M: int, N: int, K: int, device: torch.device) -> None:
 
     run_example(
         functools.partial(helion_fp8_matmul_reduce_scatter, symm_mem_buffer),
-        {"nccl+cublas": reference_fp8_matmul_reduce_scatter, "fused_baseline": reference_fused_scaled_matmul_reduce_scatter},
+        {
+            "nccl+cublas": reference_fp8_matmul_reduce_scatter,
+            "fused_baseline": reference_fused_scaled_matmul_reduce_scatter,
+        },
         (a, b, scale_a, scale_b),
         **tolerance,
     )
diff --git a/helion/_testing.py b/helion/_testing.py
@@ -34,8 +34,10 @@
 from ._utils import counters
 from .runtime.settings import _get_backend
 from .runtime.settings import is_pallas_interpret
+from helion.autotuner.base_search import (
+    _assert_close as assert_close_with_mismatch_tolerance,
+)
 from helion.autotuner.base_search import _clone_args
-from helion.autotuner.base_search import _assert_close as assert_close_with_mismatch_tolerance
 
 if _get_backend() == "pallas":
     from .autotuner.benchmarking import compute_repeat_generic as compute_repeat
@@ -1541,4 +1543,3 @@ def capture_output(self) -> Generator[_OutputCapture, None, None]:
             yield capture
         finally:
             sys.stdout, sys.stderr = old_stdout, old_stderr
-
diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
@@ -635,8 +635,9 @@ def _validate_against_baseline(
             custom_check = self.settings.autotune_baseline_accuracy_check_fn
             if custom_check is not None:
                 custom_check(output, self._baseline_output)
-                if len(self._mutated_arg_indices) > 0:
-                    custom_check(args, self._baseline_post_args)
+                if os.getenv("CHECK_INPUT_ACCURACY", "1") == "1":
+                    if len(self._mutated_arg_indices) > 0:
+                        custom_check(args, self._baseline_post_args)
             else:
                 _assert_close(
                     output,
diff --git a/helion/runtime/settings.py b/helion/runtime/settings.py
@@ -655,6 +655,7 @@ class Settings(_Settings):
             "If True, allow torch.compile to fuse this Helion kernel with surrounding Inductor ops "
             "(prologue/epilogue) when used inside torch.compile. Default False. "
             "Set HELION_TORCH_COMPILE_FUSION=1 to enable globally."
+        ),
         "config_filter": (
             "Optional callable ``(config: Config) -> bool`` that the autotuner calls on every "
             "candidate config before compiling or benchmarking it.  Configs for which the "