Skip to content

Commit 3b39bfb

Browse files
committed
[Pallas] Use exact RDIM size instead of next-power-of-2
Pallas block refs use exact tensor dimensions, so rounding RDIM_SIZE to the next power of 2 (e.g., 1000→1024) causes shape mismatches. Add Backend.static_rdim_size() and override it in PallasBackend to return the exact size. Also override next_power_of_2_host_expr to be a no-op for Pallas. Removes @xfailIfPallas from test_reduce_non_pow2 added in #1945.
1 parent b7b43b5 commit 3b39bfb

File tree

3 files changed

+16
-3
lines changed

3 files changed

+16
-3
lines changed

helion/_compiler/backend.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,12 @@ def next_power_of_2_host_expr(self, expr: str) -> str:
284284
"""Generate a host-side next-power-of-2 expression."""
285285
raise exc.BackendUnsupported(self.name, "next_power_of_2")
286286

287+
def static_rdim_size(self, numel: int) -> int:
288+
"""Return the RDIM block size for a statically known reduction dimension."""
289+
from torch._inductor.runtime.runtime_utils import next_power_of_2
290+
291+
return next_power_of_2(numel)
292+
287293
def reduction_combine_expr(
288294
self,
289295
reduction_type: str,
@@ -1121,6 +1127,15 @@ def reduction_index_expr(
11211127
def reduction_index_zero_expr(self, dtype: str) -> str:
11221128
return f"jnp.zeros([0], dtype={dtype})"
11231129

1130+
def next_power_of_2_host_expr(self, expr: str) -> str:
1131+
# Pallas block refs already have the exact tensor dimension size,
1132+
# so RDIM_SIZE must match the actual dimension (no power-of-2 rounding).
1133+
# Rounding up would create index arrays larger than the block ref.
1134+
return expr
1135+
1136+
def static_rdim_size(self, numel: int) -> int:
1137+
return numel
1138+
11241139
def adjust_block_size_constraints(
11251140
self,
11261141
block_specs: list[object],

helion/_compiler/reduction_strategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ def codegen_preamble(self, state: CodegenState) -> None:
407407
if isinstance(numel, sympy.Integer):
408408
# Static size - issue statement immediately
409409
stmt = statement_from_string(
410-
f"{block_size_var} = {next_power_of_2(int(numel))}"
410+
f"{block_size_var} = {backend.static_rdim_size(int(numel))}"
411411
)
412412
state.codegen.host_statements.append(stmt)
413413
else:

test/test_pallas.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from helion._testing import code_and_output
1212
from helion._testing import onlyBackends
1313
from helion._testing import skipUnlessPallas
14-
from helion._testing import xfailIfPallas
1514
import helion.language as hl
1615

1716

@@ -540,7 +539,6 @@ def test_emit_pipeline_loop_order(self) -> None:
540539
expected = (x.float() @ y.float() + bias.float()).to(torch.bfloat16)
541540
torch.testing.assert_close(result, expected, rtol=1e-2, atol=1e-2)
542541

543-
@xfailIfPallas("RDIM_SIZE rounded to next power of 2 causes shape mismatch")
544542
def test_reduce_non_pow2(self) -> None:
545543
"""Reduction over non-power-of-2 dim should use exact size, not rounded."""
546544
x = torch.randn(128, 1000, device=DEVICE, dtype=torch.float32)

0 commit comments

Comments (0)