Skip to content

Commit 1d684af

Browse files
committed
scaled matmul reduce scatter
1 parent 39e0be8 commit 1d684af

File tree

2 files changed

+235
-0
lines changed

2 files changed

+235
-0
lines changed
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
"""
2+
FP8 MatMul + Reduce-Scatter Fusion Example
3+
==========================================
4+
This example extends the matmul_reduce_scatter example to use FP8 inputs.
5+
Each rank holds FP8 A and B shards; the kernel computes a local FP8 GEMM
6+
(accumulating in FP32 via ``hl.dot``), writes the float16 partial result to
7+
a symmetric-memory buffer, performs an intra-group barrier, and then
8+
reduce-scatters: each rank accumulates the rows it owns from all peers'
9+
buffers, producing a ``[M//WORLD_SIZE, N]`` float16 output.
10+
"""
11+
12+
from __future__ import annotations
13+
14+
import functools
15+
import os
16+
17+
import torch
18+
from torch._C._distributed_c10d import _SymmetricMemory
19+
import torch.distributed as dist
20+
import torch.distributed._symmetric_memory as symm_mem
21+
22+
import helion
23+
from helion._testing import DEVICE
24+
from helion._testing import run_example
25+
import helion.language as hl
26+
from helion.runtime.dist_utils import symm_mem_sync
27+
28+
29+
@helion.kernel(
    config=helion.Config(
        block_sizes=[64, 64, 32],  # tile sizes for M, N, K
        num_warps=8,
        num_stages=3,
    ),
    static_shapes=True,
    ignore_warnings=[helion.exc.TensorOperationInWrapper],
)
def fp8_matmul_reduce_scatter_kernel(
    a: torch.Tensor,  # [M, K] float8_e4m3fn
    b: torch.Tensor,  # [K, N] float8_e4m3fn
    symm_mem_buffer: torch.Tensor,  # [M, N] float16, symmetric memory
    signal_pad_ptrs: torch.Tensor,  # per-rank signal-pad pointers for symm_mem_sync
    RANK: hl.constexpr,
    WORLD_SIZE: hl.constexpr,
    GROUP_NAME: hl.ProcessGroupName,
) -> torch.Tensor:
    """
    Fused FP8 MatMul + Reduce-Scatter kernel.

    Computes ``(A @ B).to(float16)`` in a distributed reduce-scatter pattern:
    each rank emits only its ``M // WORLD_SIZE`` output rows.
    """
    M, K = a.size()
    K2, N = b.size()
    # Rows of the final output owned by this rank.
    # NOTE(review): assumes M is divisible by WORLD_SIZE — confirm callers enforce this.
    M_scatter = M // WORLD_SIZE  # type: ignore[unsupported-operation]

    output = torch.empty([M_scatter, N], dtype=torch.float16, device=a.device)

    # Views of every peer's copy of the symmetric buffer (including our own).
    buffer_tuple = torch.ops.symm_mem.get_remote_tensors(symm_mem_buffer, GROUP_NAME)

    # Half-open row range [scatter_start, scatter_end) this rank reduces into.
    scatter_start = RANK * M_scatter  # type: ignore[unsupported-operation]
    scatter_end = scatter_start + M_scatter  # type: ignore[unsupported-operation]

    for tile_m, tile_n in hl.tile([M, N]):
        # FP8 GEMM tile, accumulating in FP32 via hl.dot
        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
        for tile_k in hl.tile(K):
            acc = hl.dot(a[tile_m, tile_k], b[tile_k, tile_n], acc=acc)

        # Store float16 partial result to this rank's symmetric-memory buffer
        symm_mem_buffer[tile_m, tile_n] = acc.to(torch.float16)

        # Barrier: release our write, acquire peers' writes
        # (hands_release=True, hands_acquire=True per the two trailing flags)
        hl.triton_kernel(
            symm_mem_sync,
            args=(signal_pad_ptrs, None, RANK, WORLD_SIZE, True, True),
            output_like=None,
        )

        # Reduce-scatter: accumulate only the rows this rank owns
        if tile_m.begin >= scatter_start and tile_m.begin < scatter_end:  # type: ignore[unsupported-operation]
            acc_reduce = hl.zeros([tile_m, tile_n], dtype=torch.float32)
            for remote_buffer in buffer_tuple:
                acc_reduce = acc_reduce + remote_buffer[tile_m, tile_n].to(
                    torch.float32
                )
            # Shift global rows into this rank's local [0, M_scatter) window.
            output[tile_m.index - scatter_start, tile_n] = acc_reduce.to(torch.float16)  # type: ignore[unsupported-operation]

        # Final barrier (release only) — keeps peers from racing ahead and
        # overwriting the buffer while others may still be reading this tile.
        hl.triton_kernel(
            symm_mem_sync,
            args=(signal_pad_ptrs, None, RANK, WORLD_SIZE, True, False),
            output_like=None,
        )

    return output
97+
98+
99+
def helion_fp8_matmul_reduce_scatter(
100+
symm_mem_buffer: torch.Tensor,
101+
a: torch.Tensor,
102+
b: torch.Tensor,
103+
) -> torch.Tensor:
104+
"""
105+
Wrapper that rendezvouss on the pre-allocated symmetric buffer and
106+
invokes the FP8 reduce-scatter kernel.
107+
108+
Args:
109+
symm_mem_buffer: Pre-allocated symmetric-memory buffer ``[M, N]`` float16.
110+
a: Local FP8 A shard ``[M, K]`` (``torch.float8_e4m3fn``).
111+
b: Local FP8 B shard ``[K, N]`` (``torch.float8_e4m3fn``).
112+
"""
113+
group = dist.group.WORLD
114+
if group is None:
115+
raise RuntimeError("Distributed group is not initialized")
116+
117+
symm_mem_hdl = symm_mem.rendezvous(symm_mem_buffer, group.group_name)
118+
119+
return fp8_matmul_reduce_scatter_kernel(
120+
a,
121+
b,
122+
symm_mem_buffer,
123+
symm_mem_hdl.signal_pad_ptrs_dev,
124+
RANK=symm_mem_hdl.rank,
125+
WORLD_SIZE=symm_mem_hdl.world_size,
126+
GROUP_NAME=group.group_name,
127+
)
128+
129+
130+
def reference_fp8_matmul_reduce_scatter(
    a: torch.Tensor,
    b: torch.Tensor,
) -> torch.Tensor:
    """
    Reference implementation: dequantize to float32, matmul, then
    reduce-scatter the float16 product along the M dimension.

    Raises:
        RuntimeError: If the default distributed group is not initialized.
    """
    world_group = dist.group.WORLD
    if world_group is None:
        raise RuntimeError("Distributed group is not initialized")

    # Full local GEMM in fp32, cast to the kernel's fp16 output dtype.
    partial = torch.mm(a.to(torch.float32), b.to(torch.float32)).to(torch.float16)

    nranks = dist.get_world_size(world_group)
    rows_per_rank = partial.shape[0] // nranks
    result = torch.empty(
        rows_per_rank, partial.shape[1], dtype=partial.dtype, device=partial.device
    )
    dist.reduce_scatter_tensor(result, partial, group=world_group)
    return result
148+
149+
150+
def test(M: int, N: int, K: int, device: torch.device) -> None:
    """Compare the fused FP8 reduce-scatter kernel against the reference."""
    rank = dist.get_rank()

    # Per-rank seed: each rank holds a distinct A shard.
    torch.manual_seed(42 + rank)
    a = torch.randn(M, K, device=device).to(torch.float8_e4m3fn)

    # Shared seed: every rank sees the same B matrix.
    torch.manual_seed(42)
    b = torch.randn(K, N, device=device).to(torch.float8_e4m3fn)

    # Symmetric-memory staging buffer for the partial [M, N] products.
    symm_mem_buffer = symm_mem.empty(M, N, dtype=torch.float16, device=device)
    symm_mem.rendezvous(symm_mem_buffer, dist.group.WORLD.group_name)  # type: ignore[union-attr]

    # Loose tolerances: FP8 inputs lose precision before the GEMM.
    run_example(
        functools.partial(helion_fp8_matmul_reduce_scatter, symm_mem_buffer),
        reference_fp8_matmul_reduce_scatter,
        (a, b),
        rtol=2e-1,
        atol=2e-1,
    )
172+
173+
174+
def main() -> None:
    """
    Per-process entry point: bind this process to its local CUDA device,
    initialize the NCCL process group, run the correctness test, and tear
    the group down.

    Expects ``LOCAL_RANK`` in the environment (set by torch.distributed.run).
    """
    _SymmetricMemory.signal_pad_size = 1024 * 1024 * 16
    rank = int(os.environ["LOCAL_RANK"])
    torch.manual_seed(42 + rank)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    dist.init_process_group("nccl")

    # Destroy the group even if the test raises; otherwise peer ranks can
    # hang forever inside collectives waiting for this rank.
    try:
        test(M=512, N=768, K=1024, device=device)
    finally:
        dist.destroy_process_group()
185+
186+
187+
if __name__ == "__main__":
    """
    Run with:
    python -m torch.distributed.run --standalone \\
    --nproc-per-node 4 \\
    --rdzv-backend c10d --rdzv-endpoint localhost:0 \\
    examples/distributed/fp8_matmul_reduce_scatter.py
    """
    # DEVICE comes from helion._testing; this NCCL example only runs on CUDA.
    assert DEVICE.type == "cuda", "Requires CUDA device"
    main()

test/test_distributed.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,45 @@ def do_test_matmul_reduce_scatter(self, kernel, ref_kernel):
426426

427427
torch.testing.assert_close(result, expected, rtol=1e-1, atol=1e-1)
428428

429+
    @skipIfRocm("Distributed example requires CUDA/NCCL")
    @skipIfXPU("Distributed operations require CCL, not yet fully integrated")
    @skip_if_lt_x_gpu(4)
    def test_fp8_matmul_reduce_scatter(self):
        # float8_e4m3fn matmul needs Hopper-class hardware (SM90+).
        if not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 9:
            self.skipTest("FP8 requires CUDA compute capability >= 9.0")
        self._init_process()

        mod = import_path(EXAMPLES_DIR / "distributed" / "fp8_matmul_reduce_scatter.py")

        _SymmetricMemory.signal_pad_size = 1024 * 1024 * 16
        M, N, K = 512, 768, 1024

        # Per-rank seed so each rank holds a distinct A shard...
        torch.manual_seed(42 + self.rank)
        a_fp32 = torch.randn(M, K, device=self.device)
        a = a_fp32.to(torch.float8_e4m3fn)

        # ...shared seed so every rank sees the same B matrix.
        torch.manual_seed(42)
        b_fp32 = torch.randn(K, N, device=self.device)
        b = b_fp32.to(torch.float8_e4m3fn)

        symm_mem_buffer = symm_mem.empty(M, N, dtype=torch.float16, device=self.device)
        symm_mem_hdl = symm_mem.rendezvous(symm_mem_buffer, dist.group.WORLD.group_name)

        result = mod.fp8_matmul_reduce_scatter_kernel(
            a,
            b,
            symm_mem_buffer,
            symm_mem_hdl.signal_pad_ptrs_dev,
            RANK=symm_mem_hdl.rank,
            WORLD_SIZE=symm_mem_hdl.world_size,
            GROUP_NAME=dist.group.WORLD.group_name,
        )

        expected = mod.reference_fp8_matmul_reduce_scatter(a, b)

        # Loose tolerances: FP8 quantization dominates the error budget.
        torch.testing.assert_close(result, expected, rtol=2e-1, atol=2e-1)
        self._cleanup_process()
467+
429468
@skipIfRocm("Distributed example requires CUDA/NCCL")
430469
@skipIfXPU("Distributed operations require CCL, not yet fully integrated")
431470
@skip_if_lt_x_gpu(4)

0 commit comments

Comments
 (0)