Skip to content

Commit aac9312

Browse files
Tighten DPP eligibility to require exact thread-count match and use arch DB for wave size
- Change the canUseDPP condition from >= to == for blockSize vs clusterSize * nonReductionDimSizeProduct, to prevent potential out-of-bounds LDS writes by extra threads when blockSize exceeds the exact thread count needed for the DPP layout.
- Replace the hard-coded chipset major-version heuristic in SubgroupReduceToDPP with rock::lookupArchInfo(chip).waveSize for a more robust subgroup-size derivation.
- Update the lowering_blockwise_broadcast_reduce test to use dimensions where blockSize == clusterSize * nrDimProd (8 == 2 * 4).
1 parent 91db7cc commit aac9312

3 files changed

Lines changed: 13 additions & 13 deletions

File tree

mlir/lib/Dialect/Rock/Transforms/BlockwiseGemmToThreadwise.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1360,15 +1360,16 @@ struct BlockwiseReduceRewritePattern
13601360
// 2. More than 1 reduction thread (at least 2 for cross-lane work)
13611361
// 3. partial_r > 2 (DPP overhead not justified for partial_r=2)
13621362
// 4. Reduction threads fit within a single wave
1363-
// 5. Block has enough threads or non-reduction dim is trivial
1363+
// 5. blockSize == clusterSize * nonReductionDimSizeProduct, or
1364+
// nonReductionDimSizeProduct == 1.
13641365
// Otherwise, fall back to LDS-based tree reduction.
13651366
int64_t maxActiveReductionThreads = threadViewShape[rTidDim];
13661367
int64_t clusterSize = llvm::PowerOf2Ceil(maxActiveReductionThreads);
13671368
int64_t partialR = partialRegTensorShape[rDim];
13681369
bool canUseDPP = llvm::isPowerOf2_64(maxActiveReductionThreads) &&
13691370
(maxActiveReductionThreads > 1) && (partialR > 2) &&
13701371
(maxActiveReductionThreads <= waveSize) &&
1371-
(blockSize >= maxActiveReductionThreads *
1372+
(blockSize == maxActiveReductionThreads *
13721373
nonReductionDimSizeProduct ||
13731374
nonReductionDimSizeProduct == 1);
13741375
// DPP path: contiguous threads reduce together (rtid = tid % cluster).

mlir/lib/Dialect/Rock/Transforms/SubgroupReduceToDPP.cpp

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1919
#include "mlir/Dialect/GPU/Transforms/Passes.h"
2020
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
21+
#include "mlir/Dialect/Rock/IR/AmdArchDb.h"
2122
#include "mlir/Pass/Pass.h"
2223
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
2324

@@ -51,10 +52,7 @@ struct RockSubgroupReduceToDPPPass
5152
MLIRContext *ctx = &getContext();
5253
RewritePatternSet patterns(ctx);
5354

54-
unsigned subgroupSize = 64;
55-
if (maybeChipset->majorVersion >= 10) {
56-
subgroupSize = 32;
57-
}
55+
unsigned subgroupSize = rock::lookupArchInfo(chip).waveSize;
5856

5957
populateGpuBreakDownSubgroupReducePatterns(
6058
patterns, /*maxShuffleBitwidth=*/32, PatternBenefit(3));

mlir/test/Dialect/Rock/lowering_blockwise_broadcast_reduce.mlir

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -73,26 +73,27 @@ func.func @rock_blockwise_reducesum_nr_threads_gt_blocksize(%input_reg : memref<
7373

7474
// -----
7575

76-
#inputView = #rock.transform_map<affine_map<(d0, d1) -> (d1, d0)> by [<PassThrough ["tid"] at [0] -> ["r"] at [1]>, <PassThrough ["iter"] at [1] -> ["nr_per_bid"] at [0]>] bounds = [10, 3] -> [3, 10]>
77-
#inputView_tid = #rock.transform_map<affine_map<(d0) -> (0, d0)> by [<Merge{1, 10} ["tid"] at [0] -> ["nr_per_bid", "r"] at [0, 1]>] bounds = [10] -> [1, 10]>
78-
#inputView_iter = #rock.transform_map<affine_map<(d0) -> (d0, 0)> by [<Merge{3, 1} ["iter"] at [0] -> ["nr_per_bid", "r"] at [0, 1]>] bounds = [3] -> [3, 1]>
76+
#inputView = #rock.transform_map<affine_map<(d0, d1) -> (d1, d0)> by [<PassThrough ["tid"] at [0] -> ["r"] at [1]>, <PassThrough ["iter"] at [1] -> ["nr_per_bid"] at [0]>] bounds = [8, 4] -> [4, 8]>
77+
#inputView_tid = #rock.transform_map<affine_map<(d0) -> (0, d0)> by [<Merge{1, 8} ["tid"] at [0] -> ["nr_per_bid", "r"] at [0, 1]>] bounds = [8] -> [1, 8]>
78+
#inputView_iter = #rock.transform_map<affine_map<(d0) -> (d0, 0)> by [<Merge{4, 1} ["iter"] at [0] -> ["nr_per_bid", "r"] at [0, 1]>] bounds = [4] -> [4, 1]>
7979
// CHECK-LABEL: func @rock_blockwise_reducesum_rthreads_fix
80-
func.func @rock_blockwise_reducesum_rthreads_fix(%input_reg : memref<3xf32, #gpu.address_space<private>>, %output_reg : memref<3xf32, #gpu.address_space<private>>, %ws_lds : memref<30xf32, #gpu.address_space<workgroup>>) attributes{rock.arch = "##TOKEN_ARCH##", block_size = 10 : i32, grid_size = 2 : i32, rock.kernel} {
80+
func.func @rock_blockwise_reducesum_rthreads_fix(%input_reg : memref<4xf32, #gpu.address_space<private>>, %output_reg : memref<4xf32, #gpu.address_space<private>>, %ws_lds : memref<32xf32, #gpu.address_space<workgroup>>) attributes{rock.arch = "##TOKEN_ARCH##", block_size = 8 : i32, grid_size = 2 : i32, rock.kernel} {
8181
// Compute rthread index and nr index from tid
82+
// blockSize=8, nrDimProd=4, rTid=2, cs=2 -> cs*nrDimProd=8==blockSize
8283
// CHECK-DAG: %[[TID:.*]] = rock.workitem_id : index
8384
// CHECK: %[[RTID:.*]] = arith.andi %[[TID]], %c1
8485
// CHECK: %[[NRTID:.*]] = arith.shrui %[[TID]], %c1
8586

86-
// Threadwise partial reduction uses rDimPerRThread=5
87+
// Threadwise partial reduction uses rDimPerRThread=4
8788
// CHECK: rock.transforming_for
88-
// CHECK-SAME: bounds [1, 1, 5]
89+
// CHECK-SAME: bounds [1, 1, 4]
8990
// DPP subgroup reduce replaces tree reduction
9091
// CHECK: gpu.subgroup_reduce add {{.*}} cluster(size = 2)
9192
// CHECK: arith.cmpi eq, %[[RTID]], %c0
9293
// CHECK: scf.if
9394
// CHECK: rock.lds_barrier
9495
// CHECK: rock.threadwise_read_into
95-
rock.blockwise_broadcast_reduce sum [#inputView][#inputView_tid][#inputView_iter]%input_reg into %output_reg using %ws_lds {axis = 1 : index, blockSize = 10 : i32, nrDimPerThread = 3 : index} : memref<3xf32, #gpu.address_space<private>> using memref<30xf32, #gpu.address_space<workgroup>> into memref<3xf32, #gpu.address_space<private>>
96+
rock.blockwise_broadcast_reduce sum [#inputView][#inputView_tid][#inputView_iter]%input_reg into %output_reg using %ws_lds {axis = 1 : index, blockSize = 8 : i32, nrDimPerThread = 4 : index} : memref<4xf32, #gpu.address_space<private>> using memref<32xf32, #gpu.address_space<workgroup>> into memref<4xf32, #gpu.address_space<private>>
9697
return
9798
}
9899

0 commit comments

Comments
 (0)