
Commit 8036b43

python3kgae and Xiang Li authored
Support stride for unstructured PtrState. (#289)
Allow the stride of an unstructured PtrState to be a value other than 1. This happens when the PtrState is built from mul operations, as in:

```
x_ptr + offsets_m[:, None] * stride_m + offsets_n[None, :] * stride_n
```

mulState is changed so that, for an unstructured dim, only the scalar is multiplied into the stride.

The addState logic is simplified (see the sketch below):

- If one PtrState does not contribute to the current dim, just use the other PtrState.
- If the strides of the two PtrStates are equal, as in `lhs_offset * stride + rhs_offset * stride`, treat it as `(lhs_offset + rhs_offset) * stride` by adding only the offsets.
- If the offsets of the two PtrStates are equal, as in `offset * lhs_stride + offset * rhs_stride`, treat it as `offset * (lhs_stride + rhs_stride)` by adding only the strides.
- If neither the offsets nor the strides are equal, rewrite `offset * stride` as `(offset * stride) * 1` by setting the new offset to `offset * stride` and the new stride to 1. The two PtrStates then have equal strides and take the equal-stride path.

With this change, tts.make_gather_scatter_tptr gets the correct strides.

---------

Co-authored-by: Xiang Li <xiagli@microsoft.com>
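A minimal sketch of the merge rules above (plain Python with hypothetical names, not the actual PtrState API, which works on MLIR OpFoldResults): each dimension is modeled as an (offset, stride) pair.

```python
def add_dim(lhs, rhs):
    """Merge the (offset, stride) pairs of one dimension for lhs + rhs."""
    (lhs_off, lhs_stride), (rhs_off, rhs_stride) = lhs, rhs
    if lhs_off == 0 and lhs_stride == 0:
        return rhs  # lhs does not contribute to this dim
    if rhs_off == 0 and rhs_stride == 0:
        return lhs  # rhs does not contribute to this dim
    if lhs_off != rhs_off and lhs_stride != rhs_stride:
        # Rewrite offset * stride as (offset * stride) * 1 on both sides,
        # so the equal-stride path below applies.
        lhs_off, lhs_stride = lhs_off * lhs_stride, 1
        rhs_off, rhs_stride = rhs_off * rhs_stride, 1
    if lhs_stride == rhs_stride:
        # lhs_off * s + rhs_off * s == (lhs_off + rhs_off) * s
        return (lhs_off + rhs_off, lhs_stride)
    # off * ls + off * rs == off * (ls + rs)
    assert lhs_off == rhs_off, "unequal strides imply equal offsets"
    return (lhs_off, lhs_stride + rhs_stride)

assert add_dim((3, 4), (0, 0)) == (3, 4)   # other state not for this dim
assert add_dim((3, 4), (5, 4)) == (8, 4)   # equal strides: offsets add
assert add_dim((3, 4), (3, 2)) == (3, 6)   # equal offsets: strides add
assert add_dim((3, 4), (5, 2)) == (22, 1)  # neither: fold, stride becomes 1
```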
1 parent e1ca305 commit 8036b43

10 files changed: 213 additions & 140 deletions

lib/AnalysisStructured/PtrAnalysis.cpp

Lines changed: 104 additions & 39 deletions
```diff
@@ -145,13 +145,16 @@ LogicalResult PtrState::rebuildAsUnsupportedOp(Value operand) {
   // Setup state for unsupported operation.
   auto indexTy = IndexType::get(operand.getContext());
   auto index0 = IntegerAttr::get(indexTy, APInt(64, 0));
+  auto index1 = IntegerAttr::get(indexTy, APInt(64, 1));
   for (auto size : opShape) {
-    if (size == 1)
+    if (size == 1) {
       offsets.push_back(index0);
-    else
+      strides.push_back(index0);
+    } else {
       offsets.push_back(operand);
+      strides.push_back(index1);
+    }
     sizes.push_back(IntegerAttr::get(indexTy, APInt(64, size)));
-    strides.push_back(index0);
     shape.push_back(index0);
   }
   return success();
```
```diff
@@ -174,9 +177,10 @@ LogicalResult PtrState::rebuildAsGatherScatter(Value op, int nonContinuousDim) {
   // Setup state for nonContinuousDim.
   auto indexTy = IndexType::get(op.getContext());
   auto index0 = IntegerAttr::get(indexTy, APInt(64, 0));
+  auto index1 = IntegerAttr::get(indexTy, APInt(64, 1));

   offsets[nonContinuousDim] = op;
-  strides[nonContinuousDim] = index0;
+  strides[nonContinuousDim] = index1;
   shape[nonContinuousDim] = index0;
   return success();
 }
```
```diff
@@ -222,43 +226,105 @@ LogicalResult PtrState::addState(const PtrState &lhsState,
           addOFRs(lhsState.strides[i], rhsState.strides[i], loc, builder);
       strides.push_back(newStride);
     } else {
-      // Set stride to 1 when not continuous.
-      strides.push_back(builder.getIndexAttr(1));
-      // New offset is offset * stride.
-      auto newLhsOffset = lhsState.offsets[i];
-      auto newRhsOffset = rhsState.offsets[i];
       if (isAnalysisingUnstructured) {
         assert(!lhsState.hasModulo() && !rhsState.hasModulo() &&
                "should not have dimension with modulo when analysing "
                "unstructured");
-        // When the dimension is structured, mul the offset by the stride to
-        // match the stride 1 for non-structured dimensions.
-        // If the dimension is not structured, the offset is already multiplied
-        // by the stride.
-        // If stride is 0 which will happen after
-        // visitOperandExpandDims/visitOperandSplat, we cannot mul which will
-        // get zero and lost the offset.
-        if (lhsState.dimIsStructured(i) && !hasConstZero(lhsState.strides[i])) {
-          auto stride = expandOFRIndex(lhsState.strides[i], lhsState.offsets[i],
-                                       loc, builder);
-          newLhsOffset = mulOFRs(lhsState.offsets[i], stride, loc, builder);
-        }
-        if (rhsState.dimIsStructured(i) && !hasConstZero(rhsState.strides[i])) {
-          auto stride = expandOFRIndex(rhsState.strides[i], rhsState.offsets[i],
-                                       loc, builder);
-          newRhsOffset = mulOFRs(rhsState.offsets[i], stride, loc, builder);
-        }
-        // Make sure newLhsOffset and newRhsOffset get same type.
-        if (!lhsState.dimIsStructured(i)) {
-          newRhsOffset =
-              expandOFRIndex(newRhsOffset, newLhsOffset, loc, builder);
+        if (hasConstZero(lhsState.strides[i]) &&
+            hasConstZero(lhsState.offsets[i])) {
+          // If lhs is not for dim i, just use rhs's stride and offset.
+          offsets.push_back(rhsState.offsets[i]);
+          strides.push_back(rhsState.strides[i]);
+        } else if (hasConstZero(rhsState.strides[i]) &&
+                   hasConstZero(rhsState.offsets[i])) {
+          // If rhs is not for dim i, just use lhs's stride and offset.
+          offsets.push_back(lhsState.offsets[i]);
+          strides.push_back(lhsState.strides[i]);
         } else {
-          newLhsOffset =
-              expandOFRIndex(newLhsOffset, newRhsOffset, loc, builder);
+          OpFoldResult lhsOffset = lhsState.offsets[i];
+          OpFoldResult rhsOffset = rhsState.offsets[i];
+          OpFoldResult lhsStride = lhsState.strides[i];
+          OpFoldResult rhsStride = rhsState.strides[i];
+          // If the stride is 0, which happens after
+          // visitOperandExpandDims/visitOperandSplat, set it to 1 so it can
+          // be multiplied with the offset.
+          if (hasConstZero(lhsStride)) {
+            assert(lhsState.dimIsStructured(i) &&
+                   !rhsState.dimIsStructured(i) &&
+                   "If lhs stride is zero, it must be structured and rhs "
+                   "stride is unstructured");
+            lhsStride = builder.getIndexAttr(1);
+          }
+          if (hasConstZero(rhsStride)) {
+            assert(rhsState.dimIsStructured(i) &&
+                   !lhsState.dimIsStructured(i) &&
+                   "If rhs stride is zero, it must be structured and lhs "
+                   "stride is unstructured");
+            rhsStride = builder.getIndexAttr(1);
+          }
+
+          // If neither the offsets nor the strides are equal, merge the two
+          // PtrStates by rewriting offset * stride as (offset * stride) * 1:
+          // the new offset is offset * stride and the new stride is 1. Both
+          // strides are then 1, so the states merge on the equal-stride path
+          // below.
+          if (lhsOffset != rhsOffset && lhsStride != rhsStride) {
+            // Expand the stride since an unstructured offset has tensor type.
+            OpFoldResult stride =
+                expandOFRIndex(lhsStride, lhsOffset, loc, builder);
+            // new offset = offset * stride
+            lhsOffset = mulOFRs(lhsOffset, stride, loc, builder);
+            // Expand the stride since an unstructured offset has tensor type.
+            stride = expandOFRIndex(rhsStride, rhsOffset, loc, builder);
+            // new offset = offset * stride
+            rhsOffset = mulOFRs(rhsOffset, stride, loc, builder);
+            // Set both strides to 1.
+            lhsStride = builder.getIndexAttr(1);
+            rhsStride = builder.getIndexAttr(1);
+          }
+
+          if (lhsStride == rhsStride) {
+            // For a case like lhs_offset * stride + rhs_offset * stride,
+            // which equals (lhs_offset + rhs_offset) * stride, add the
+            // offsets and reuse the stride:
+            //   offsets[i] = lhsOffset + rhsOffset
+            //   strides[i] = lhsStride
+            // Expand the structured offset since an unstructured offset has
+            // tensor type.
+            if (!lhsState.dimIsStructured(i)) {
+              rhsOffset = expandOFRIndex(rhsOffset, lhsOffset, loc, builder);
+            } else {
+              lhsOffset = expandOFRIndex(lhsOffset, rhsOffset, loc, builder);
+            }
+            // Add offsets.
+            offsets.push_back(addOFRs(lhsOffset, rhsOffset, loc, builder));
+            // Reuse stride.
+            strides.push_back(lhsStride);
+          } else {
+            // If the strides are not equal, the offsets must be equal,
+            // because the case where both differ was already forced to
+            // stride 1 above.
+            assert(lhsOffset == rhsOffset &&
+                   "If strides are not equal, offsets must be equal");
+            // For a case like offset * lhs_stride + offset * rhs_stride,
+            // which equals offset * (lhs_stride + rhs_stride), add the
+            // strides and reuse the offset:
+            //   offsets[i] = lhsOffset
+            //   strides[i] = lhsStride + rhsStride
+
+            // Reuse offsets.
+            offsets.push_back(lhsOffset);
+            // Add strides.
+            strides.push_back(addOFRs(lhsStride, rhsStride, loc, builder));
+          }
         }
-        auto newOffset = addOFRs(newLhsOffset, newRhsOffset, loc, builder);
-        offsets.push_back(newOffset);
       } else {
+        // Set stride to 1 when not continuous.
+        strides.push_back(builder.getIndexAttr(1));
+        // New offset is offset * stride.
+        auto newLhsOffset = lhsState.offsets[i];
+        auto newRhsOffset = rhsState.offsets[i];
         // Just propagate the unstructured offset to the result to track the
         // unstructured dimension. The real address calculation will be done
        // later in the PtrAnalysis::visitOperandAddptr.
```
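The stride-0 normalization in this hunk can be shown with a tiny sketch (plain Python, hypothetical names; the real code works on OpFoldResults). After visitOperandExpandDims/visitOperandSplat a structured dimension carries stride 0, and multiplying its offset by that stride would lose the offset, so the stride is bumped to 1 first:

```python
def normalize_zero_stride(offset, stride, dim_is_structured):
    # Mirrors the asserts in the hunk: only a structured dim paired with an
    # unstructured one is expected to carry stride 0 here.
    if stride == 0:
        assert dim_is_structured
        stride = 1
    return offset, stride

# A splat scalar contributes (offset=7, stride=0); after normalization it
# merges with an unstructured (offsets, stride=1) state on the equal-stride
# path, so the offsets simply add.
assert normalize_zero_stride(7, 0, dim_is_structured=True) == (7, 1)
```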
```diff
@@ -432,13 +498,12 @@ LogicalResult PtrState::mulState(const PtrState &lhsState,
     assert(!lhs->dimHasModulo(i) &&
            "should not have non-structured dimension with modulo");
     if (isAnalysisingUnstructured) {
-      auto rhsStride =
-          expandOFRIndex(rhs->scalar, lhs->offsets[i], loc, builder);
       assert(!lhs->hasModulo() &&
              "should not have non-structured dimension with modulo");
-      OpFoldResult newOffset =
-          mulOFRs(lhs->offsets[i], rhsStride, loc, builder);
-      offsets.push_back(newOffset);
+      // Keep offsets as-is for the unstructured dimension.
+      // The address calculation will be done later in the
+      // structured-to-memref pass.
+      offsets.push_back(lhs->offsets[i]);
       // Mul the scalar to stride.
       OpFoldResult newStride =
           mulOFRs(lhs->strides[i], rhs->scalar, loc, builder);
```
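A hedged sketch of the mulState change (plain Python, hypothetical names): for an unstructured dimension the scalar multiplier now folds into the stride, leaving the gather offsets untouched so the stride survives to tts.make_gather_scatter_tptr.

```python
def mul_unstructured_dim(offsets, stride, scalar):
    # Before this commit the offsets were multiplied by the scalar and the
    # stride stayed at 1, losing the stride information.
    return offsets, stride * scalar

# offsets_m[:, None] * stride_m with stride_m == 8: offsets are kept,
# the stride becomes 8.
assert mul_unstructured_dim([0, 2, 5], 1, 8) == ([0, 2, 5], 8)
```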

lib/Conversion/StructuredToMemref/StructuredToMemref.cpp

Lines changed: 23 additions & 1 deletion
```diff
@@ -137,6 +137,27 @@ static OpFoldResult accumulateTargetOffset(Location loc,
   return targetOffset;
 }

+static OpFoldResult accumulateTargetOffset(Location loc,
+                                           ArrayRef<OpFoldResult> offsets,
+                                           ArrayRef<OpFoldResult> strides,
+                                           int gatherDim,
+                                           OpBuilder &b) {
+  OpFoldResult targetOffset = b.getIndexAttr(0);
+  for (int i = 0; i < offsets.size(); i++) {
+    OpFoldResult offset = offsets[i];
+    // If this is the gather dimension, multiply the offset by the stride.
+    // Non-gather dimensions are already multiplied by the stride in the
+    // offsets in PtrAnalysis.
+    if (i == gatherDim) {
+      OpFoldResult stride = strides[i];
+      offset = mulOFRs(offset, stride, loc, b);
+    }
+    targetOffset = addOFRs(targetOffset, offset, loc, b);
+  }
+  return targetOffset;
+}
+
 static Value rewriteGatherScatterPtrElement(
     ArrayRef<int64_t> resultShape, tts::MakeGatherScatterTensorPtrOp op,
     Value basePtr, Value gatherOffsetElt, int gatherDim,
```
```diff
@@ -149,7 +170,8 @@ static Value rewriteGatherScatterPtrElement(

   auto offsets = op.getMixedOffsets();
   offsets[gatherDim] = gatherOffsetElt;
-  auto targetOffset = accumulateTargetOffset(op.getLoc(), offsets, rewriter);
+  auto targetOffset = accumulateTargetOffset(op.getLoc(), offsets,
+                                             mixedStrides, gatherDim, rewriter);

   auto staticTargetOffset = getIntAttr(targetOffset);
   auto resultType =
```
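The arithmetic of the new overload can be modeled in a few lines (plain Python, illustrative names): only the gather dimension still needs its offset scaled by the stride, because PtrAnalysis has already folded offset * stride into the other dimensions' offsets.

```python
def accumulate_target_offset(offsets, strides, gather_dim):
    target = 0
    for i, offset in enumerate(offsets):
        if i == gather_dim:
            # Only the gather dim is scaled here; the other offsets were
            # pre-multiplied by their strides in PtrAnalysis.
            offset *= strides[i]
        target += offset
    return target

# Gather element offset 5 with stride 16 on dim 0, plus a pre-folded offset
# of 3 on dim 1: 5 * 16 + 3 == 83.
assert accumulate_target_offset([5, 3], [16, 1], gather_dim=0) == 83
```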

python/examples/test_index_select.py

Lines changed: 1 addition & 16 deletions
```diff
@@ -219,7 +219,7 @@ def index_select_row_with_double_mod2(input_tensor, indices, dim, mod_offset, mo
     o_stride_m = output_tensor.stride(0)
     o_stride_n = output_tensor.stride(1)

-    a = index_select_row_with_double_mod_kernel2[1,](
+    index_select_row_with_double_mod_kernel2[1,](
         input_tensor,
         output_tensor,
         indices,
```
```diff
@@ -235,21 +235,6 @@ def index_select_row_with_double_mod2(input_tensor, indices, dim, mod_offset, mo
     )
     return output_tensor

-    index_select_row_with_mod_kernel[1,](
-        input_tensor,
-        output_tensor,
-        indices,
-        stride_i,
-        stride_m,
-        stride_n,
-        o_stride_m,
-        o_stride_n,
-        mod_offset,
-        BLOCK_I=R,
-        BLOCK_N=N,
-    )
-    return output_tensor
-

 def test_index_select_row_with_double_mod2(device):
     M, N = 16, 16
```

test/Conversion/StructuredToMemref/gather_scatter_ptr_to_linalg.mlir

Lines changed: 20 additions & 20 deletions
```diff
@@ -34,8 +34,8 @@
 // CHECK-SAME: %[[VAL_18:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i32) {
 // CHECK: %[[VAL_19:.*]] = arith.constant 1 : index
 // CHECK: %[[VAL_20:.*]] = arith.constant 8 : i32
-// CHECK: %[[VAL_21:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_22:.*]] = arith.constant 16 : index
+// CHECK: %[[VAL_21:.*]] = arith.constant 16 : index
+// CHECK: %[[VAL_22:.*]] = arith.constant 0 : index
 // CHECK: %[[VAL_23:.*]] = tensor.empty() : tensor<16x1xi32>
 // CHECK: %[[VAL_24:.*]] = linalg.fill ins(%[[VAL_20]] : i32) outs(%[[VAL_23]] : tensor<16x1xi32>) -> tensor<16x1xi32>
 // CHECK: %[[VAL_25:.*]] = arith.muli %[[VAL_5]], %[[VAL_20]] : i32
```
```diff
@@ -69,27 +69,27 @@
 // CHECK: linalg.yield %[[VAL_52]] : i32
 // CHECK: } -> tensor<16x1xi32>
 // CHECK: %[[VAL_53:.*]] = arith.index_cast %[[VAL_6]] : i64 to index
-// CHECK: %[[VAL_54:.*]] = arith.index_cast %[[VAL_53]] : index to i32
-// CHECK: %[[VAL_55:.*]] = linalg.fill ins(%[[VAL_54]] : i32) outs(%[[VAL_23]] : tensor<16x1xi32>) -> tensor<16x1xi32>
-// CHECK: %[[VAL_56:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_48]], %[[VAL_55]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_48]] : tensor<16x1xi32>) {
-// CHECK: ^bb0(%[[VAL_57:.*]]: i32, %[[VAL_58:.*]]: i32, %[[VAL_59:.*]]: i32):
-// CHECK: %[[VAL_60:.*]] = arith.muli %[[VAL_57]], %[[VAL_58]] : i32
-// CHECK: linalg.yield %[[VAL_60]] : i32
+// CHECK: %[[VAL_54:.*]] = linalg.fill ins(%[[VAL_37]] : i32) outs(%[[VAL_23]] : tensor<16x1xi32>) -> tensor<16x1xi32>
+// CHECK: %[[VAL_55:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_32]], %[[VAL_54]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_32]] : tensor<16x1xi32>) {
+// CHECK: ^bb0(%[[VAL_56:.*]]: i32, %[[VAL_57:.*]]: i32, %[[VAL_58:.*]]: i32):
+// CHECK: %[[VAL_59:.*]] = arith.addi %[[VAL_56]], %[[VAL_57]] : i32
+// CHECK: linalg.yield %[[VAL_59]] : i32
 // CHECK: } -> tensor<16x1xi32>
-// CHECK: %[[VAL_61:.*]] = linalg.fill ins(%[[VAL_37]] : i32) outs(%[[VAL_23]] : tensor<16x1xi32>) -> tensor<16x1xi32>
-// CHECK: %[[VAL_62:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_32]], %[[VAL_61]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_32]] : tensor<16x1xi32>) {
+// CHECK: %[[VAL_60:.*]] = arith.index_cast %[[VAL_53]] : index to i32
+// CHECK: %[[VAL_61:.*]] = linalg.fill ins(%[[VAL_60]] : i32) outs(%[[VAL_23]] : tensor<16x1xi32>) -> tensor<16x1xi32>
+// CHECK: %[[VAL_62:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_48]], %[[VAL_61]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_48]] : tensor<16x1xi32>) {
 // CHECK: ^bb0(%[[VAL_63:.*]]: i32, %[[VAL_64:.*]]: i32, %[[VAL_65:.*]]: i32):
-// CHECK: %[[VAL_66:.*]] = arith.addi %[[VAL_63]], %[[VAL_64]] : i32
+// CHECK: %[[VAL_66:.*]] = arith.muli %[[VAL_63]], %[[VAL_64]] : i32
 // CHECK: linalg.yield %[[VAL_66]] : i32
 // CHECK: } -> tensor<16x1xi32>
 // CHECK: %[[VAL_67:.*]] = arith.index_cast %[[VAL_46]] : index to i32
 // CHECK: %[[VAL_68:.*]] = linalg.fill ins(%[[VAL_67]] : i32) outs(%[[VAL_23]] : tensor<16x1xi32>) -> tensor<16x1xi32>
-// CHECK: %[[VAL_69:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_62]], %[[VAL_68]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_62]] : tensor<16x1xi32>) {
+// CHECK: %[[VAL_69:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_55]], %[[VAL_68]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_55]] : tensor<16x1xi32>) {
 // CHECK: ^bb0(%[[VAL_70:.*]]: i32, %[[VAL_71:.*]]: i32, %[[VAL_72:.*]]: i32):
 // CHECK: %[[VAL_73:.*]] = arith.muli %[[VAL_70]], %[[VAL_71]] : i32
 // CHECK: linalg.yield %[[VAL_73]] : i32
 // CHECK: } -> tensor<16x1xi32>
-// CHECK: %[[VAL_74:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_56]], %[[VAL_69]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_56]] : tensor<16x1xi32>) {
+// CHECK: %[[VAL_74:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_62]], %[[VAL_69]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_62]] : tensor<16x1xi32>) {
 // CHECK: ^bb0(%[[VAL_75:.*]]: i32, %[[VAL_76:.*]]: i32, %[[VAL_77:.*]]: i32):
 // CHECK: %[[VAL_78:.*]] = arith.addi %[[VAL_75]], %[[VAL_76]] : i32
 // CHECK: linalg.yield %[[VAL_78]] : i32
```
```diff
@@ -103,12 +103,12 @@
 // CHECK: } -> tensor<16x1xi32>
 // CHECK: %[[VAL_86:.*]] = tensor.collapse_shape %[[VAL_81]] {{\[\[}}0, 1]] : tensor<16x1xi32> into tensor<16xi32>
 // CHECK: %[[VAL_87:.*]] = arith.index_cast %[[VAL_25]] : i32 to index
-// CHECK: %[[VAL_88:.*]] = arith.minsi %[[VAL_87]], %[[VAL_22]] : index
-// CHECK: %[[VAL_89:.*]] = arith.maxsi %[[VAL_88]], %[[VAL_21]] : index
-// CHECK: %[[VAL_90:.*]] = arith.minsi %[[VAL_89]], %[[VAL_22]] : index
+// CHECK: %[[VAL_88:.*]] = arith.minsi %[[VAL_87]], %[[VAL_21]] : index
+// CHECK: %[[VAL_89:.*]] = arith.maxsi %[[VAL_88]], %[[VAL_22]] : index
+// CHECK: %[[VAL_90:.*]] = arith.minsi %[[VAL_89]], %[[VAL_21]] : index
 // CHECK: %[[VAL_91:.*]] = memref.alloc() : memref<16x16xf32>
-// CHECK: %[[VAL_92:.*]] = arith.minsi %[[VAL_90]], %[[VAL_22]] : index
-// CHECK: scf.for %[[VAL_93:.*]] = %[[VAL_21]] to %[[VAL_92]] step %[[VAL_19]] {
+// CHECK: %[[VAL_92:.*]] = arith.minsi %[[VAL_90]], %[[VAL_21]] : index
+// CHECK: scf.for %[[VAL_93:.*]] = %[[VAL_22]] to %[[VAL_92]] step %[[VAL_19]] {
 // CHECK: %[[VAL_94:.*]] = tensor.extract %[[VAL_86]]{{\[}}%[[VAL_93]]] : tensor<16xi32>
 // CHECK: %[[VAL_95:.*]] = arith.index_cast %[[VAL_94]] : i32 to index
 // CHECK: %[[VAL_96:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_95]]], sizes: [1, 16], strides: [1, 1] : memref<*xf32> to memref<1x16xf32, strided<[1, 1], offset: ?>>
```
```diff
@@ -133,7 +133,7 @@
 // CHECK: } -> tensor<16x1xi32>
 // CHECK: %[[VAL_114:.*]] = arith.index_cast %[[VAL_105]] : index to i32
 // CHECK: %[[VAL_115:.*]] = linalg.fill ins(%[[VAL_114]] : i32) outs(%[[VAL_23]] : tensor<16x1xi32>) -> tensor<16x1xi32>
-// CHECK: %[[VAL_116:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_62]], %[[VAL_115]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_62]] : tensor<16x1xi32>) {
+// CHECK: %[[VAL_116:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_1]], #[[$ATTR_1]]], iterator_types = ["parallel", "parallel"]} ins(%[[VAL_55]], %[[VAL_115]] : tensor<16x1xi32>, tensor<16x1xi32>) outs(%[[VAL_55]] : tensor<16x1xi32>) {
 // CHECK: ^bb0(%[[VAL_117:.*]]: i32, %[[VAL_118:.*]]: i32, %[[VAL_119:.*]]: i32):
 // CHECK: %[[VAL_120:.*]] = arith.muli %[[VAL_117]], %[[VAL_118]] : i32
 // CHECK: linalg.yield %[[VAL_120]] : i32
```
```diff
@@ -151,7 +151,7 @@
 // CHECK: linalg.yield %[[VAL_132]] : i32
 // CHECK: } -> tensor<16x1xi32>
 // CHECK: %[[VAL_133:.*]] = tensor.collapse_shape %[[VAL_128]] {{\[\[}}0, 1]] : tensor<16x1xi32> into tensor<16xi32>
-// CHECK: scf.for %[[VAL_134:.*]] = %[[VAL_21]] to %[[VAL_92]] step %[[VAL_19]] {
+// CHECK: scf.for %[[VAL_134:.*]] = %[[VAL_22]] to %[[VAL_92]] step %[[VAL_19]] {
 // CHECK: %[[VAL_135:.*]] = tensor.extract %[[VAL_133]]{{\[}}%[[VAL_134]]] : tensor<16xi32>
 // CHECK: %[[VAL_136:.*]] = arith.index_cast %[[VAL_135]] : i32 to index
 // CHECK: %[[VAL_137:.*]] = tensor.extract_slice %[[VAL_99]]{{\[}}%[[VAL_134]], 0] [1, 8] [1, 1] : tensor<16x16xf32> to tensor<1x8xf32>
```
