Tighten the rock.lds_transpose_load ODS result-type constraint to the

stefankoncarevic · stefankoncarevic · commit 519a2115da13 · 2026-05-04T08:58:22.000-05:00
exact set of valid vector types and drop the now-redundant verifier
checks. Add ldsTransposeConfig structural checks to
ThreadwiseReadIntoOp::verify (rank-1 + static dest, supported element
type, (geometry, type) consistency). Replace the assert + .value() in
emitThreadwiseHWTranspose with emitOpError to avoid UB in release
builds and reject non-rank-1 / dynamic destinations up-front. Use
AmdArchInfo::hasLdsTransposeLoad for arch gating, share a single
isValidLdsTransposeMfmaGeometry helper, align the numWaves formula
with computeWaveGridLayout, drop the dead `case 3` branch (tuning only
emits power-of-2 wave-tile factors), and refresh doc comments. Add
four negative ODS-coverage tests for the result-type constraint.
diff --git a/mlir/include/mlir/Dialect/Rock/IR/RockOps.td b/mlir/include/mlir/Dialect/Rock/IR/RockOps.td
@@ -1232,7 +1232,9 @@ def Rock_LDSTransposeLoadOp
       Arguments<(ins Arg<MemRefOf<[F16, BF16, F8E4M3FN, F8E5M2]>,
                          "LDS source buffer">:$source,
           Variadic<Index>:$indices)>,
-      Results<(outs AnyVectorOfNonZeroRank:$result)> {
+      Results<(outs AnyTypeOf<
+          [VectorOfLengthAndType<[4], [F16, BF16]>,
+           VectorOfLengthAndType<[8], [F8E4M3FN, F8E5M2]>]>:$result)> {
   let summary =
       "Hardware-assisted LDS transpose load for matrix accelerator tile";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/Rock/utility/LdsTransposeLoad.h b/mlir/include/mlir/Dialect/Rock/utility/LdsTransposeLoad.h
@@ -49,6 +49,18 @@ namespace mlir::rock::hwtranspose {
 // Operand selector (A or B matrix)
 enum class OperandKind { A, B };
 
+// Returns true if the given (D, K) MFMA geometry is one of the geometries
+// recognized by the LDS transpose load lowering.
+// Recognized combinations:
+//   Standard:   (16,16), (16,32), (32,8), (32,16)
+//   Scaled FP8: (16,128) quad-rate, (32,64) quad-rate
+// Note: this is geometry-only recognition. Element-type compatibility
+// (e.g., FP8-only quad-rate) is enforced separately by the caller.
+inline bool isValidLdsTransposeMfmaGeometry(int64_t dDim, int64_t kDim) {
+  return (dDim == 16 && (kDim == 16 || kDim == 32 || kDim == 128)) ||
+         (dDim == 32 && (kDim == 8 || kDim == 16 || kDim == 64));
+}
+
 // Build LDS transpose config attribute from already-computed MFMA params.
 // Used in BlockwiseLoadTileToThreadwise when decision was made upstream.
 // Requires mfmaDDim > 0 and mfmaKDim > 0 (asserted).
diff --git a/mlir/lib/Dialect/Rock/IR/RockDialect.cpp b/mlir/lib/Dialect/Rock/IR/RockDialect.cpp
@@ -20,6 +20,7 @@
 #include "mlir/Dialect/Rock/IR/AccelEmitter.h"
 #include "mlir/Dialect/Rock/IR/AmdArchDb.h"
 #include "mlir/Dialect/Rock/IR/GetRockInfo.h"
+#include "mlir/Dialect/Rock/utility/LdsTransposeLoad.h"
 #include "mlir/Dialect/Rock/utility/transformMapUtils.h"
 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
@@ -558,22 +559,14 @@ LogicalResult TransformMapAttr::verify(
   return success();
 }
 
-// Helper function to check valid MFMA geometry for LDS transpose
-static bool isValidLdsTransposeMfmaGeometry(int64_t dDim, int64_t kDim) {
-  // Supported geometries:
-  // Standard: (16,16), (16,32), (32,8), (32,16)
-  // Scaled FP8: (16,128), (32,64)
-  return (dDim == 16 && (kDim == 16 || kDim == 32 || kDim == 128)) ||
-         (dDim == 32 && (kDim == 8 || kDim == 16 || kDim == 64));
-}
-
 LogicalResult LDSTransposeConfigAttr::verify(
     function_ref<InFlightDiagnostic()> emitError, int64_t dDim, int64_t kDim,
     int64_t mPerBlock, int64_t nPerBlock, int64_t kPerBlock, int64_t mPerWave,
     int64_t nPerWave, bool doubleBuffering, bool isOperandA) {
 
-  // Validate MFMA geometry
-  if (!isValidLdsTransposeMfmaGeometry(dDim, kDim)) {
+  // Validate MFMA geometry (geometry-only recognition; type-aware checks
+  // are enforced by the lowering decision in LdsTransposeLoad.cpp).
+  if (!hwtranspose::isValidLdsTransposeMfmaGeometry(dDim, kDim)) {
     return emitError()
            << "invalid MFMA geometry (" << dDim << "x" << kDim
            << ") for LDS transpose - valid combinations: "
@@ -2154,42 +2147,17 @@ LogicalResult LDSTransposeLoadOp::verify() {
   if (!memSpaceCheck.value())
     return emitOpError("source memory address space must be workgroup (LDS)");
 
-  // Result element type must match source element type
+  // Result element type must match source element type. ODS guarantees the
+  // result is one of the allowed vector types, so cast<VectorType> is safe.
   Type srcElemType = srcType.getElementType();
-  VectorType resultType = getResult().getType();
-  Type resultElemType = resultType.getElementType();
-
+  Type resultElemType =
+      cast<VectorType>(getResult().getType()).getElementType();
   if (resultElemType != srcElemType) {
     return emitOpError("result element type (")
            << resultElemType << ") must match source element type ("
            << srcElemType << ")";
   }
 
-  if (resultType.getRank() != 1)
-    return emitOpError("expected 1-D result vector, but got rank ")
-           << resultType.getRank();
-
-  // Verify result vector length based on element type:
-  // - 16-bit types (f16, bf16): ds_read_tr16_b64 returns 4 elements
-  // - 8-bit types (f8E4M3FN, f8E5M2 - OCP FP8 for gfx950): ds_read_tr8_b64
-  // returns 8 elements
-  int64_t expectedVecLen;
-  if (srcElemType.isF16() || srcElemType.isBF16()) {
-    expectedVecLen = 4;
-  } else if (isa<Float8E4M3FNType>(srcElemType) ||
-             isa<Float8E5M2Type>(srcElemType)) {
-    expectedVecLen = 8;
-  } else {
-    return emitOpError("unsupported element type for LDS transpose load: ")
-           << srcElemType;
-  }
-
-  if (resultType.getNumElements() != expectedVecLen) {
-    return emitOpError("expected result vector of ")
-           << expectedVecLen << " elements for " << srcElemType
-           << " type, but got " << resultType.getNumElements();
-  }
-
   // Check hardware support using AmdArchDb
   StringRef arch = rock::getArchValue(*this);
   AmdArchInfo archInfo = rock::lookupArchInfo(arch);
@@ -2373,6 +2341,31 @@ LogicalResult ThreadwiseReadIntoOp::verify() {
           "in register-to-register reads produced by input fusion");
     }
   }
+
+  // Structural checks for the LDS transpose load fast path.
+  if (LDSTransposeConfigAttr cfg = getLdsTransposeConfigAttr()) {
+    if (destType.getRank() != 1 || destType.isDynamicDim(0))
+      return emitOpError("ldsTransposeConfig requires a rank-1 destination "
+                         "with a static shape");
+    Type destElemType = destType.getElementType();
+    bool isFp8 = isa<Float8E4M3FNType, Float8E5M2Type>(destElemType);
+    bool is16Bit = destElemType.isF16() || destElemType.isBF16();
+    if (!is16Bit && !isFp8)
+      return emitOpError("ldsTransposeConfig only supports f16, bf16, "
+                         "f8E4M3FN, or f8E5M2 destination element types");
+    int64_t dDim = cfg.getDDim();
+    int64_t kDim = cfg.getKDim();
+    bool isQuadRateGeometry =
+        (dDim == 16 && kDim == 128) || (dDim == 32 && kDim == 64);
+    bool isF16OnlyGeometry =
+        (dDim == 16 && kDim == 16) || (dDim == 32 && kDim == 8);
+    if (isFp8 && isF16OnlyGeometry)
+      return emitOpError("MFMA geometry (")
+             << dDim << "x" << kDim << ") is not supported for FP8/BF8";
+    if (is16Bit && isQuadRateGeometry)
+      return emitOpError("quad-rate MFMA geometry (")
+             << dDim << "x" << kDim << ") is only valid for FP8/BF8";
+  }
   return success();
 }
 
diff --git a/mlir/lib/Dialect/Rock/utility/LdsTransposeLoad.cpp b/mlir/lib/Dialect/Rock/utility/LdsTransposeLoad.cpp
@@ -50,8 +50,6 @@ using namespace mlir::rock;
 namespace mlir::rock::hwtranspose {
 namespace {
 
-bool archSupported(StringRef arch) { return arch.contains("gfx950"); }
-
 // Check if element type is supported for LDS transpose load
 // - f16, bf16: ds_read_tr16_b64 (4 elements)
 // - f8E4M3FN, f8E5M2 (OCP FP8 for gfx950): ds_read_tr8_b64 (8 elements)
@@ -78,15 +76,6 @@ static int64_t getTransposeLoadVectorLength(Type elemType) {
   llvm_unreachable("Unsupported element type for LDS transpose load");
 }
 
-// Validates MFMA geometry for LDS transpose support.
-// Supported combinations:
-// Standard: (16,16), (16,32), (32,8), (32,16)
-// Scaled FP8: (16,128) quad-rate, (32,64) quad-rate
-static bool isValidMfmaGeometry(int64_t dDim, int64_t kDim) {
-  return (dDim == 16 && (kDim == 16 || kDim == 32 || kDim == 128)) ||
-         (dDim == 32 && (kDim == 8 || kDim == 16 || kDim == 64));
-}
-
 // Shape of a single MFMA instruction (internal use only).
 struct MfmaInstrShape {
   int64_t mnMfma;
@@ -140,8 +129,10 @@ static Decision makeDecision(StringRef arch, Type elemTypeA, Type elemTypeB,
   dec.nPerWave = nPerWave;
   dec.doubleBuffering = doubleBuffering;
 
-  // Basic applicability checks
-  if (!archSupported(arch) || !DirectToLds) {
+  // Basic applicability checks. Use the arch DB as the single source of truth
+  // for which architectures support ds_read_tr* (kept consistent with the
+  // verifier in RockDialect.cpp via AmdArchInfo::hasLdsTransposeLoad).
+  if (!rock::lookupArchInfo(arch).hasLdsTransposeLoad || !DirectToLds) {
     return dec;
   }
 
@@ -165,10 +156,28 @@ static Decision makeDecision(StringRef arch, Type elemTypeA, Type elemTypeB,
     return dec;
 
   // Validate MFMA geometry
-  if (!isValidMfmaGeometry(shape.mnMfma, shape.kMfma)) {
+  if (!isValidLdsTransposeMfmaGeometry(shape.mnMfma, shape.kMfma)) {
     return dec;
   }
 
+  // Reject geometry/type combinations not handled in getBasePanelOffsets:
+  //   - F16/BF16 path supports: (16,16), (16,32), (32,8), (32,16)
+  //   - FP8/BF8  path supports: (16,32), (32,16), (16,128), (32,64)
+  // Mismatched pairs would hit llvm_unreachable in getBasePanelOffsets.
+  // typesCompatible() above already guarantees A and B are either identical
+  // or both FP8/BF8 variants, so checking elemTypeA is sufficient.
+  bool isQuadRateGeometry = (shape.mnMfma == 16 && shape.kMfma == 128) ||
+                            (shape.mnMfma == 32 && shape.kMfma == 64);
+  bool isF16OnlyGeometry = (shape.mnMfma == 16 && shape.kMfma == 16) ||
+                           (shape.mnMfma == 32 && shape.kMfma == 8);
+  if (isFp8Type(elemTypeA)) {
+    if (isF16OnlyGeometry)
+      return dec;
+  } else {
+    if (isQuadRateGeometry)
+      return dec;
+  }
+
   if (!validatePaneling(shape, operand, mPerBlock, nPerBlock, kPerBlock)) {
     return dec;
   }
@@ -325,9 +334,9 @@ LDSTransposeDecision decideLDSTransposeForOperands(
   }
   // else - neither operand usable, enableA/enableB remain false.
 
-  // Check if numWaves is supported (1, 2, 3, 4, 8, 16)
-  // TODO: support 32 waves for WMMA
-  int64_t numWaves = (mPerBlock * nPerBlock) / (mPerWave * nPerWave);
+  // Check if numWaves is supported (1, 2, 4, 8, 16).
+  // TODO: support 32 waves for WMMA.
+  int64_t numWaves = (mPerBlock / mPerWave) * (nPerBlock / nPerWave);
   if (numWaves > 16) {
     result.enableA = false;
     result.enableB = false;
@@ -346,7 +355,7 @@ LDSTransposeConfigAttr buildTransposeAttrFromParams(
   // INVARIANT: MFMA geometry must be valid
   assert(mfmaDDim > 0 && mfmaKDim > 0 &&
          "MFMA geometry must be set when building transpose attributes");
-  assert(isValidMfmaGeometry(mfmaDDim, mfmaKDim) &&
+  assert(isValidLdsTransposeMfmaGeometry(mfmaDDim, mfmaKDim) &&
          "Invalid MFMA geometry for LDS transpose - valid: (16,16), (16,32), "
          "(16,128), (32,8), (32,16), (32,64)");
 
@@ -582,14 +591,17 @@ static Value emitPanelLoad(PatternRewriter &b, Location loc, Value rawSrc,
 //===----------------------------------------------------------------------===//
 // writePanelVectorsToDestination - Write loaded panel vectors to destination
 //
-// Extracts individual f16/bf16 elements from loaded panel vectors and writes
-// them sequentially to the destination buffer. Each panel vector contains 4
-// elements (ds_read_tr16_b64 always returns vector<4xf16>).
+// Extracts individual elements from loaded panel vectors and writes them
+// sequentially to the destination buffer. Panel vector width depends on the
+// element type:
+//   - f16/bf16:  vector<4>  (ds_read_tr16_b64)
+//   - fp8/bf8:   vector<8>  (ds_read_tr8_b64)
 //
 // Parameters:
 //   panelVectors - Array of loaded panel vectors (vector<4> for f16/bf16,
-//   vector<8> for fp8/bf8) dest     - Destination memref (rank-1, scalar
-//   layout) targetElems  - Maximum number of elements to write
+//                  vector<8> for fp8/bf8)
+//   dest         - Destination memref (rank-1, scalar layout)
+//   targetElems  - Maximum number of elements to write
 //
 // Returns:
 //   success() if all target elements were written
@@ -655,13 +667,17 @@ writePanelVectorsToDestination(PatternRewriter &b, Location loc,
 //===----------------------------------------------------------------------===//
 // getBasePanelOffsets - Compute per-panel LDS offsets for a given lane ID
 //
-// Given a wavefront lane ID and a specific MFMA layout (L16x32, L16x16, etc.),
-// this function computes the base byte offsets into LDS memory where each
-// lane should read its operands from.
+// Given a wavefront lane ID and a specific MFMA layout, this function computes
+// the base byte offsets into LDS memory where each lane should read its
+// operands from. These offsets are derived from AMD's LDS tiling and MFMA
+// operand layout conventions, mapping each lane's register to the correct
+// element position in LDS.
 //
-// These offsets are derived from AMD's LDS tiling and MFMA operand layout
-// conventions (e.g., 16x16, 16x32 panels). The goal is to map each lane's
-// register to the correct element position in LDS.
+// Supported (dDim, kDim) combinations per element type:
+//   F16 / BF16:  (16,16), (16,32), (32,8), (32,16)   -- ds_read_tr16_b64
+//   FP8 / BF8:   (16,32), (32,16), (16,128), (32,64) -- ds_read_tr8_b64
+// Any other (type, geometry) combination triggers llvm_unreachable. Callers
+// must validate the (type, geometry) pair upstream (see makeDecision()).
 //
 // Note: This is an internal helper function. Use computeLDSBaseOffsets()
 // instead for better readability.
@@ -809,7 +825,7 @@ static SmallVector<Value> getBasePanelOffsets(PatternRewriter &b, Location loc,
 //
 // Parameters:
 //   dDim - MFMA D dimension (M or N, 16 or 32)
-//   kDim - MFMA K dimension (8, 16, or 32)
+//   kDim - MFMA K dimension (8, 16, 32, 64, or 128)
 //   lane - Thread's lane ID within the workgroup
 //   elemType - Element type (f16, bf16, fp8, or bf8) for selecting lane mapping
 //
@@ -841,17 +857,20 @@ static std::pair<Value, Value> computeLDSBaseOffsets(PatternRewriter &b,
 // dimensions, and decomposes the wave ID into a 2D grid position.
 //
 // This version uses a deterministic layout selection based solely on the number
-// of physical waves (1, 2, 3, or 4). The goal is to match the wave grid to the
-// number of available wave tiles (waveTilesInM, waveTilesInN) while choosing a
-// stable and predictable layout.
+// of physical waves. The goal is to match the wave grid to the number of
+// available wave tiles (waveTilesInM, waveTilesInN) while choosing a stable
+// and predictable layout.
 //
 // Key principles:
-//  - physicalWaves ∈ {1, 2, 3, 4} (corresponding to 64–256 threads)
+//  - physicalWaves ∈ {1, 2, 4, 8, 16}. Tuning generates only power-of-2
+//    wave-tile factors (see computeDPerWave's `factor *= 2` step), so
+//    numWaves is always a product of two power-of-2 values.
 //  - Prefer balanced or natural layouts when possible:
 //        1 wave  → 1×1
 //        2 waves → prefer 1×2
-//        3 waves → prefer 1×3
 //        4 waves → prefer 2×2
+//        8 waves → prefer 2×4
+//       16 waves → prefer 4×4
 //  - If a preferred layout does not fit the available tiles, fallback logic
 //    selects the best possible layout while maintaining determinism.
 //  - The result defines which spatial tile each wave is responsible for,
@@ -908,21 +927,6 @@ computeWaveGridLayout(PatternRewriter &b, Location loc, Value waveId,
     }
     break;
 
-  case 3:
-    // Three waves: prefer 1×3, fallback to 3×1 or dimension-based
-    if (waveTilesInN >= 3) {
-      wavesInM = 1;
-      wavesInN = 3;
-    } else if (waveTilesInM >= 3) {
-      wavesInM = 3;
-      wavesInN = 1;
-    } else {
-      // Fallback: choose dimension with more tiles (outer loop handles rest)
-      wavesInM = (waveTilesInN >= waveTilesInM) ? 1 : 3;
-      wavesInN = (waveTilesInN >= waveTilesInM) ? 3 : 1;
-    }
-    break;
-
   case 4:
     // Four waves: prefer 2×2 (balanced), then 1×4, 4×1, or fallback
     if (waveTilesInM >= 2 && waveTilesInN >= 2) {
@@ -1286,12 +1290,13 @@ LogicalResult emitThreadwiseHWTranspose(PatternRewriter &b,
   OperandKind operand =
       config.getIsOperandA() ? OperandKind::A : OperandKind::B;
 
-  // Compute wave grid layout and decompose wave ID into 2D position
+  // Compute wave grid layout and decompose wave ID into 2D position.
   FailureOr<WaveGridLayout> maybeWaveGrid = computeWaveGridLayout(
       b, loc, waveId, mPerWave, nPerWave, mPerBlock, nPerBlock);
-  assert(succeeded(maybeWaveGrid) &&
-         "If we decided to use transpose load, this must work");
-  WaveGridLayout waveGrid = maybeWaveGrid.value();
+  if (failed(maybeWaveGrid))
+    return op.emitOpError(
+        "unsupported wave grid layout for LDS transpose load");
+  WaveGridLayout waveGrid = *maybeWaveGrid;
   Value waveM = waveGrid.waveM;
   Value waveN = waveGrid.waveN;
 
@@ -1450,8 +1455,13 @@ LogicalResult emitThreadwiseHWTranspose(PatternRewriter &b,
            << expectedLoads << ", got " << panelVectors.size();
   }
 
-  // Write loaded panel vectors to destination buffer
-  // Destination is rank-1 with scalar sequential layout
+  // Write loaded panel vectors to destination buffer.
+  // Destination must be rank-1 with a static shape; we cannot statically
+  // size the writes otherwise.
+  if (destType.getRank() != 1 || destType.isDynamicDim(0)) {
+    return op.emitOpError(
+        "LDS transpose load destination must be rank-1 with a static shape");
+  }
   int64_t destCap = destType.getShape()[0];
   int64_t targetElems = std::min<int64_t>(sliceElems, destCap);
 
diff --git a/mlir/test/Dialect/Rock/ops_error.mlir b/mlir/test/Dialect/Rock/ops_error.mlir