Skip to content

Commit bafff2a

Browse files
committed
Add epilogue subtiling example and tuple output support in run_example
1 parent 8823a17 commit bafff2a

File tree

3 files changed

+164
-16
lines changed

3 files changed

+164
-16
lines changed

examples/epilogue_subtiling.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""
2+
Epilogue Subtiling Example
3+
==========================
4+
5+
This example demonstrates matmul kernels with heavy epilogues that benefit from
6+
epilogue subtiling on Blackwell (sm_100+). Epilogue subtiling splits the store
7+
from ``[BLOCK_M, BLOCK_N]`` into ``SUBTILE_FACTOR x [BLOCK_M, BLOCK_N / SUBTILE_FACTOR]``,
8+
halving the accumulator shared-memory footprint and enabling an extra pipeline stage.
9+
"""
10+
11+
# %%
12+
# Imports
13+
# -------
14+
15+
# %%
16+
from __future__ import annotations
17+
18+
import torch
19+
20+
import helion
21+
from helion._testing import DEVICE
22+
from helion._testing import HALF_DTYPE
23+
from helion._testing import run_example
24+
import helion.language as hl
25+
26+
# %%
27+
# Kernel 1 -- Matmul + Residual + Bias + GELU + Cast
28+
# ---------------------------------------------------
29+
# CUTLASS-style residual + bias + GELU forward epilogue with two
30+
# fp32 reads (residual, bias) fused into the output tile.
31+
32+
33+
# %%
34+
@helion.kernel(static_shapes=True)
def matmul_bias_residual_gelu_cast(
    x: torch.Tensor,
    w: torch.Tensor,
    bias: torch.Tensor,
    residual: torch.Tensor,
) -> torch.Tensor:
    """Matmul with a fused residual + bias + GELU epilogue, stored as fp16.

    Computes ``gelu((x @ w) * 1.25 + residual * 0.5 + bias)`` tile by tile,
    accumulating the matmul in float32 and casting the final tile to float16.
    The heavy multi-read epilogue is what makes this kernel benefit from
    epilogue subtiling (see module docstring).

    Args:
        x: Left matmul operand, shape ``[m, k]``.
        w: Right matmul operand, shape ``[k, n]``.
        bias: Per-column bias, shape ``[n]`` (broadcast across rows).
        residual: Residual input, shape ``[m, n]``, scaled by 0.5 in the epilogue.

    Returns:
        Float16 tensor of shape ``[m, n]``.
    """
    m, k = x.size()
    _, n = w.size()
    out = torch.empty([m, n], dtype=torch.float16, device=x.device)

    # One iteration per [tile_m, tile_n] output tile.
    for tile_m, tile_n in hl.tile([m, n]):
        # fp32 accumulator for numerical stability of the K reduction.
        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
        for tile_k in hl.tile(k):
            acc = torch.addmm(acc, x[tile_m, tile_k], w[tile_k, tile_n])

        # Epilogue: alpha-style scale, scaled residual add, bias add, GELU.
        val = acc * 1.25
        val = val + residual[tile_m, tile_n].to(torch.float32) * 0.5
        val = val + bias[tile_n]
        val = torch.nn.functional.gelu(val)
        # Single fp16 store per tile — the store helion may subtile.
        out[tile_m, tile_n] = val.to(torch.float16)

    return out
57+
58+
59+
# %%
60+
# Kernel 2 -- Matmul + Bias + GELU with Auxiliary Output
61+
# ------------------------------------------------------
62+
# cuBLASLt / CUTLASS-style GELU+AUX forward epilogue that writes both
63+
# the pre-activation (aux) and post-GELU (out) tensors.
64+
65+
66+
# %%
67+
@helion.kernel(static_shapes=True)
def matmul_bias_gelu_aux(
    x: torch.Tensor,
    w: torch.Tensor,
    bias: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Matmul + bias + GELU that also emits the pre-activation tensor.

    Mirrors the cuBLASLt/CUTLASS GELU+AUX forward epilogue: both the
    pre-activation (``aux``) and post-GELU (``out``) values are written,
    each cast to float16.

    Args:
        x: Left matmul operand, shape ``[m, k]``.
        w: Right matmul operand, shape ``[k, n]``.
        bias: Per-column bias, shape ``[n]`` (broadcast across rows).

    Returns:
        Tuple ``(out, aux)`` of float16 ``[m, n]`` tensors, where
        ``aux = (x @ w) * 1.25 + bias`` and ``out = gelu(aux)``
        (GELU applied to the fp32 pre-activation before the cast).
    """
    m, k = x.size()
    _, n = w.size()
    out = torch.empty([m, n], dtype=torch.float16, device=x.device)
    aux = torch.empty([m, n], dtype=torch.float16, device=x.device)

    for tile_m, tile_n in hl.tile([m, n]):
        # fp32 accumulator for the K reduction.
        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
        for tile_k in hl.tile(k):
            acc = torch.addmm(acc, x[tile_m, tile_k], w[tile_k, tile_n])

        # Pre-activation: alpha-style scale plus broadcast bias.
        pre = acc * 1.25
        pre = pre + bias[tile_n]
        # Two stores per tile: pre-activation (aux) and activated output.
        aux[tile_m, tile_n] = pre.to(torch.float16)
        out[tile_m, tile_n] = torch.nn.functional.gelu(pre).to(torch.float16)

    return out, aux
89+
90+
91+
# %%
92+
# Verification
93+
# ------------
94+
95+
96+
# %%
97+
def check(m: int, k: int, n: int) -> None:
    """Verify both kernels against eager fp32 references via ``run_example``.

    Args:
        m: Rows of the left operand / output.
        k: Shared (reduction) dimension.
        n: Columns of the right operand / output.
    """

    def _rand(*shape: int) -> torch.Tensor:
        return torch.randn(list(shape), device=DEVICE, dtype=HALF_DTYPE)

    # NOTE: allocation order fixes the RNG sequence — keep it stable.
    lhs = _rand(m, k)
    rhs = _rand(k, n)
    col_bias = _rand(n)
    res = _rand(m, n)

    def baseline_residual_gelu(
        a: torch.Tensor,
        b: torch.Tensor,
        c: torch.Tensor,
        r: torch.Tensor,
    ) -> torch.Tensor:
        # Eager fp32 reference for the residual+bias+GELU epilogue.
        prod = a.float() @ b.float()
        fused = prod * 1.25 + r.float() * 0.5 + c.float()
        return torch.nn.functional.gelu(fused).half()

    run_example(
        matmul_bias_residual_gelu_cast,
        baseline_residual_gelu,
        (lhs, rhs, col_bias, res),
    )

    def baseline_gelu_aux(
        a: torch.Tensor,
        b: torch.Tensor,
        c: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Eager fp32 reference returning (post-GELU, pre-activation).
        prod = a.float() @ b.float()
        pre = prod * 1.25 + c.float()
        return torch.nn.functional.gelu(pre).half(), pre.half()

    run_example(
        matmul_bias_gelu_aux,
        baseline_gelu_aux,  # pyrefly: ignore[bad-argument-type]
        (lhs, rhs, col_bias),
    )
133+
134+
135+
# %%
136+
# Main
137+
# ----
138+
139+
140+
# %%
141+
def main() -> None:
    """Run the correctness check on a large square (8192^3) problem."""
    check(8192, 8192, 8192)
143+
144+
145+
# Script entry point: run the check only when executed directly,
# not when imported (e.g. by docs tooling).
if __name__ == "__main__":
    main()

helion/_compiler/device_function.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -325,13 +325,6 @@ def allocate_store_index(self) -> int:
325325
self.device_memory_op_index += 1
326326
return idx
327327

328-
def allocate_store_index(self) -> int:
329-
"""Bump store counters and return the indexing strategy slot."""
330-
self.device_store_index += 1
331-
idx = self.device_memory_op_index
332-
self.device_memory_op_index += 1
333-
return idx
334-
335328
def get_indexing_strategy(self, index: int) -> IndexingStrategy:
336329
from .indexing_strategy import IndexingStrategy
337330
from .indexing_strategy import PointerIndexingStrategy

helion/_testing.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,14 @@ def code_and_output(
937937
return code, result
938938

939939

940+
def _as_tensors(result: object) -> list[torch.Tensor]:
941+
"""Normalize a single tensor or tuple of tensors to a flat list."""
942+
if isinstance(result, tuple):
943+
return [t.clone() for t in result]
944+
assert isinstance(result, torch.Tensor)
945+
return [result.clone()]
946+
947+
940948
def run_example(
941949
kernel_fn: Callable[..., torch.Tensor] | Kernel | dict[str, Kernel],
942950
baseline_fn: Callable[..., torch.Tensor] | dict[str, Callable[..., torch.Tensor]],
@@ -975,20 +983,21 @@ def run_example(
975983

976984
# Check correctness against first baseline
977985
first_baseline_name, first_baseline_func = next(iter(baselines.items()))
978-
expected = first_baseline_func(*args).clone()
986+
expected = _as_tensors(first_baseline_func(*args))
979987

980988
for name, func in {**kernels, **baselines}.items():
981989
if name != first_baseline_name:
982990
print(f"Testing {name} correctness...", file=sys.stderr)
983-
# Clone args to avoid buffer donation issues (e.g., Pallas/TPU)
984991
cloned_args = _clone_args(args)
985-
result = func(*cloned_args).clone()
986-
torch.testing.assert_close(
987-
result.to(torch.float32),
988-
expected.to(torch.float32),
989-
rtol=rtol,
990-
atol=atol,
991-
)
992+
result = _as_tensors(func(*cloned_args))
993+
assert len(result) == len(expected)
994+
for r, e in zip(result, expected, strict=True):
995+
torch.testing.assert_close(
996+
r.to(torch.float32),
997+
e.to(torch.float32),
998+
rtol=rtol,
999+
atol=atol,
1000+
)
9921001

9931002
# Test backward pass
9941003
if bwd:

0 commit comments

Comments
 (0)