Skip to content

Commit db741c7

Browse files
pytorchbot and Isalia20
authored
[MPS] fix compiling of SDPA producing nan results (#178009)
[MPS] fix compiling of SDPA producing nan results (#175481) Fixes #171764 Took me a while to figure out wth was going wrong. Mini reproducer: ```python import torch # (uint / 65536) % non_power of 2, gives wrong result lib = torch.mps.compile_shader(''' kernel void func(device int* out, uint idx [[thread_position_in_grid]]) { out[idx] = (idx / 65536) % 6; } ''') out = torch.empty(128, device='mps', dtype=torch.int32) lib.func(out) # Every value should be 0 since xindex/65536 == 0 for xindex in [0,127] for i in [0, 5, 6, 7, 63, 64]: print(f"{i=} got {out[i].item()}") ``` Same purely in swift ```swift import Metal let device = MTLCreateSystemDefaultDevice()! let queue = device.makeCommandQueue()! let shaderSource = """ kernel void func(device int* out [[buffer(0)]], uint idx [[thread_position_in_grid]]) { out[idx] = (idx / 65536) % 6; } """ let library = try device.makeLibrary(source: shaderSource, options: nil) let function = library.makeFunction(name: "func")! let pipeline = try device.makeComputePipelineState(function: function) let count = 128 let buffer = device.makeBuffer(length: count * MemoryLayout<Int32>.stride, options: .storageModeShared)! let cmdBuf = queue.makeCommandBuffer()! let encoder = cmdBuf.makeComputeCommandEncoder()! encoder.setComputePipelineState(pipeline) encoder.setBuffer(buffer, offset: 0, index: 0) encoder.dispatchThreads( MTLSizeMake(count, 1, 1), threadsPerThreadgroup: MTLSizeMake(min(count, pipeline.maxTotalThreadsPerThreadgroup), 1, 1) ) encoder.endEncoding() cmdBuf.commit() cmdBuf.waitUntilCompleted() let ptr = buffer.contents().bindMemory(to: Int32.self, capacity: count) for i in [0, 5, 6, 7, 63, 64] { print("i=\(i) got \(ptr[i])") } ``` Pull Request resolved: #175481 Approved by: https://github.com/malfet (cherry picked from commit 3a9554c) Co-authored-by: Isalia20 <irakli.salia854@gmail.com>
1 parent 483b55d commit db741c7

4 files changed

Lines changed: 52 additions & 0 deletions

File tree

c10/metal/utils.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,18 @@ inline common_dtype<T, U> floor_divide(T x, U y) {
189189
return ::metal::floor(x / y);
190190
}
191191

192+
// Workaround for Metal compiler bug: the compiler produces wrong results
// when optimizing fused (x / A) % B expressions for integral types.
// NOTE(review): the `volatile` qualifier on the by-value parameter appears
// to be the actual workaround -- presumably it forces `x` to be
// materialized so the optimizer cannot fuse the caller's division with
// this modulo. Confirm before "simplifying" it away.
template <
    typename T,
    typename U,
    ::metal::enable_if_t<
        is_scalar_integral_v<T> && is_scalar_integral_v<U>,
        bool> = true>
// Returns x % y in the common dtype of T and U; participates in overload
// resolution only when both operands are scalar integral types.
inline common_dtype<T, U> safe_mod(volatile T x, U y) {
  return x % y;
}
203+
192204
// fmod
193205
template <
194206
typename T,

test/inductor/test_mps_basic.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,25 @@ def fn(a, b):
191191
),
192192
)
193193

194+
def test_sdpa_split_qkv(self):
    # Regression test for a Metal compiler bug where a fused (x / A) % B
    # expression produces wrong results, causing incorrect reads from
    # non-contiguous tensors in the compiled SDPA kernel.
    n_head, n_embd, seq_len = 6, 384, 1024
    head_dim = n_embd // n_head
    x = torch.randn(16, seq_len, n_embd, device="mps")
    c_attn = torch.nn.Linear(n_embd, 3 * n_embd).to("mps").eval()
    # Split the fused QKV projection and reshape each piece into
    # (batch, heads, seq, head_dim); the transpose makes them non-contiguous.
    q, k, v = (
        t.view(16, seq_len, n_head, head_dim).transpose(1, 2)
        for t in c_attn(x).split(n_embd, dim=2)
    )

    def fn(q, k, v):
        return torch.nn.functional.scaled_dot_product_attention(
            q, k, v, is_causal=True
        )

    self.common(fn, (q, k, v), atol=1e-4, rtol=1e-4, check_lowp=False)
212+
194213

195214
class MPSBasicTestsAOTI(TestCase):
196215
def check_model(self, m, inp, dynamic_shapes=None):

test/test_mps.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13313,6 +13313,22 @@ def test_metal_error_buffer(self):
1331313313
with self.assertRaisesRegex(RuntimeError, "Index .* exceeds limit"):
1331413314
torch.mps.synchronize()
1331513315

13316+
def test_metal_compiler_bug_workaround(self):
    # Regression test: (uint / 65536) % <non-power-of-2> gives a wrong
    # result when the Metal compiler fuses the two ops; safe_mod is the
    # workaround being exercised here.
    compiled = torch.mps.compile_shader('''
#include <c10/metal/utils.h>

kernel void func(device int* out, uint idx [[thread_position_in_grid]]) {
out[idx] = c10::metal::safe_mod((idx / 65536), 6);
}
''')
    result = torch.empty(128, device='mps', dtype=torch.int32)
    compiled.func(result)
    # idx / 65536 == 0 for every idx in [0, 128), so each element must be 0.
    for probe in (0, 5, 6, 7, 63, 64):
        self.assertEqual(result[probe], 0)
13330+
13331+
1331613332

1331713333
# TODO: Actually instantiate that test for the "mps" device to better reflect what it is doing.
1331813334
# This requires mps to be properly registered in the device generic test framework which is not the

torch/_inductor/codegen/mps.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ def _print_FloorDiv(self, expr: sympy.Expr) -> str:
8080

8181
def _print_ModularIndexing(self, expr: sympy.Expr) -> str:
    """Emit Metal source for ModularIndexing(x, div, mod) == (x // div) % mod.

    Every integral modulo is routed through ``c10::metal::safe_mod`` to work
    around a Metal compiler bug that mis-optimizes fused ``(x / A) % B``
    expressions (PR 175481). The miscompilation is NOT specific to
    ``A == 65536`` or to non-power-of-two ``B`` -- those were merely the
    values in the original reproducer -- so the workaround must not be gated
    on them. Gating on ``mod & (mod - 1)`` also broke for symbolic ``mod``
    expressions, where bitwise ops are undefined.
    """
    x, div, mod = expr.args
    x = self.doprint(x)
    if div != 1:
        div = self.doprint(div)
        # NOTE(review): this integer/float split is reconstructed from
        # context hidden between the diff hunks -- confirm against the file.
        if expr.is_integer:
            x = f"({x}) / ({div})"
        else:
            x = f"metal::floor({x}) / ({div})"
    mod = self.doprint(mod)
    if expr.is_integer:
        # Integral case: always use the workaround helper instead of a
        # bare `%` so the compiler cannot fuse the division and modulo.
        return f"c10::metal::safe_mod({x}, {mod})"
    return f"({x}) % ({mod})"
9297

9398
def _print_Min(self, expr: sympy.Expr) -> str:

0 commit comments

Comments
 (0)