microsoft
diff --git a/‎include/triton-shared/Analysis/OpFoldResultUtils.h‎
Lines changed: 5 additions & 1 deletion b/‎include/triton-shared/Analysis/OpFoldResultUtils.h‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎include/triton-shared/AnalysisStructured/PtrAnalysis.h‎
Lines changed: 23 additions & 0 deletions b/‎include/triton-shared/AnalysisStructured/PtrAnalysis.h‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎include/triton-shared/Dialect/TritonStructured/IR/TritonStructuredDialect.td‎
Lines changed: 76 additions & 0 deletions b/‎include/triton-shared/Dialect/TritonStructured/IR/TritonStructuredDialect.td‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎lib/Analysis/OpFoldResultUtils.cpp‎
Lines changed: 97 additions & 30 deletions b/‎lib/Analysis/OpFoldResultUtils.cpp‎
Lines changed: 97 additions & 30 deletions
diff --git a/‎lib/Analysis/PtrAnalysis.cpp‎
Lines changed: 2 additions & 2 deletions b/‎lib/Analysis/PtrAnalysis.cpp‎
Lines changed: 2 additions & 2 deletions
@@ -35,6 +35,10 @@ Value ofrToIndexValue(const OpFoldResult ofr, const Location loc, OpBuilder &b);
 SmallVector<Value> ofrsToIndexValues(ArrayRef<OpFoldResult> ofrs,
                                      const Location loc, OpBuilder &b);
 
+// Convert `ofr` to the type of the Value held by `targetForTy`, casting and/or
+// splatting as needed. `ofr` is returned unchanged when `targetForTy` is a
+// constant (getIntAttr succeeds on it) or when the types already match.
+OpFoldResult expandOFRIndex(OpFoldResult ofr, OpFoldResult targetForTy,
+                            const Location loc, OpBuilder &b);
+
 // Process addition of two OFRs. If both OFRs are Integer Attributes, result
 // is an Integer Attribute. Otherwise, insert the arith.addi instruction if
 // needed and use its result Value.
@@ -50,7 +54,7 @@ OpFoldResult subOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
 // Process multiplication of two OFRs. If both OFRs are Integer Attributes,
 // result is an Integer Attribtue. Otherwise, insert the arith.muli
 // instruction if needed and use its result Value.
-OpFoldResult mulOFRValue(const OpFoldResult lhs, const Value rhs,
+OpFoldResult mulOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
                          const Location loc, OpBuilder &b);
 
 OpFoldResult minOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
 
@@ -57,10 +57,31 @@ struct PtrState {
 
   bool dimHasModulo(uint32_t dim) const;
 
+  bool dimIsStructured(uint32_t dim) const;
+  int32_t getNonStructuredDim() const;
+  // The case where rank is 1 and the only dimension is not continuous,
+  // i.e. no dimension is continuous.
+  bool noStructuredDim() const;
+
+  bool isStructured() const;
+
   bool isBlockPtr() const;
 
   void dump() const;
 
+  // For unsupported op, save the op to the state.
+  LogicalResult rebuildAsUnsupportedOp(Value op);
+
+  // When merging with another state that is not structured, set the offset of
+  // the nonContinuous dimension to op.
+  // The op must contribute only to nonContinuousDim; fail if the op already
+  // mixes different dims.
+  // For the case
+  //    add  %remsi(on dim0), %mul(dim1)
+  //    the add involves both dim0 and dim1, so rebuilding from the op would
+  //    have to use op[nonContinuousDim], which is not supported.
+  LogicalResult rebuildAsGatherScatter(Value op, int nonContinuousDim);
+
   // Process addition of two PtrStates.
   LogicalResult addState(const PtrState &lhsState, const PtrState &rhsState,
                          Operation *op, OpBuilder &builder);
@@ -71,6 +92,8 @@ struct PtrState {
 
   tts::MakeTensorPtrOp createTTSMakeTensorPtrOp(OpBuilder &builder,
                                                 Location loc);
+  tts::MakeGatherScatterTensorPtrOp
+  createTTSMakeGatherScatterTensorPtrOp(OpBuilder &builder, Location loc);
 };
 
 class PtrAnalysis {
 
@@ -120,6 +120,82 @@ def TTS_MakeTensorPtrOp
   //let hasCanonicalizer = 1;
 }
 
+def TTS_MakeGatherScatterTensorPtrOp
+  : TTS_Op<"make_gather_scatter_tptr", [AttrSizedOperandSegments, Pure]> {
+  // NOTE: Only cases where the offset for each dimension is defined by a
+  //       different operation are supported.
+  //       An offset that is a tensor loaded from another pointer and that spans
+  //       multiple dimensions is not supported.
+  //
+  //       offset_m = tl.arange(0, M)
+  //       offset_n = tl.arange(0, N)
+  //       offset_k = tl.arange(0, K)
+  //       ld_offsets = tl.load(a_ptr + offset_m[:,None]+offset_n[None,:])
+  //       not_support = tl.load(b_ptr + ld_offsets)
+  //       not_support2 = tl.load(b_ptr + ld_offsets * (offset_m[:,None]+offset_n[None,:]))
+  //       not_support3 = tl.load(b_ptr + (ld_offsets * (offset_m[:,None]+offset_n[None,:]))[:, :, None] + offset_k[None,None,:])
+  //
+  //       # Cases where one dimension is structured while the other is not are supported.
+  //       # In the next line, `offset_m[:, None] // K` is not structured, whereas
+  //       # `offset_n[None, :]` is structured.
+  //       supported = tl.load(b_ptr + offset_m[:, None] // K + offset_n[None, :])

+  let summary = "create an pointer that points to a tensor in memory for gather/scatter";
+  let description = [{
+    The `tts.make_gather_scatter_tptr` operation is similar to `tts.make_tptr`.
+    The key difference is that `make_gather_scatter_tptr` accesses the tensor non-continuously.
+    Currently, only one dimension is allowed to be non-continuous.
+    This dimension is saved in `gather_scatter_dim`, and the offset for that dimension is saved in `gather_scatter_offset`.
+    Each contiguous load will load from this offset.
+    Cases with more than one non-continuous dimension are not supported.
+  }];
+
+  // base:    Base pointer used to construct the tensor of pointers or pointer to tensor.
+  // gather_scatter_offset: The offset for gather/scatter (an i32 tensor).
+  // gather_scatter_dim: The dimension that gather_scatter_offset applies to.
+  // sizes:   Size of the data being loaded or stored.
+  // strides: The strides of the parent tensor, which means how much to increase the pointer
+  //          by when moving by 1 element in a specific axis.
+  //          Dynamic entries are operands; static entries live in static_strides.
+  // offsets: Offset of the block along each dimension from base.
+  //          Dynamic entries are operands; static entries live in static_offsets.
+  // result:  The pointer-like value (TT_PtrLike) produced by this op.
+  //          NOTE(review): the earlier wording distinguished "pointer to tensor" from
+  //          "tensor of pointers" based on an `order` operand, but this op declares no
+  //          `order` -- confirm which kind of result is intended.
+
+  let arguments = (ins TT_Ptr:$base,
+                       I32Tensor:$gather_scatter_offset,
+                       I32Attr:$gather_scatter_dim,
+                       DenseI64ArrayAttr:$sizes,
+                       Variadic<Index>:$strides,
+                       Variadic<Index>:$offsets,
+                       DenseI64ArrayAttr:$static_strides,
+                       DenseI64ArrayAttr:$static_offsets);
+
+  let results = (outs TT_PtrLike:$result);
+
+  let assemblyFormat = [{
+    $base `to` `sizes` `` `:` $sizes
+    `gather_scatter_dim` `` `:` $gather_scatter_dim
+    `gather_scatter_offset` `` `:` $gather_scatter_offset
+    `` `,` `strides` `` `:`
+    custom<DynamicIndexList>($strides, $static_strides)
+    `` `,` `offsets` `` `:`
+    custom<DynamicIndexList>($offsets, $static_offsets)
+    attr-dict `:`  type($gather_scatter_offset) type($base) `to` type($result)
+  }];
+
+
+  let builders = [
+    // Build with mixed static and dynamic entries: each stride/offset is an
+    // OpFoldResult, i.e. either a constant attribute or an SSA value.
+    OpBuilder<(ins
+      "Value":$base,
+      "Value":$gather_scatter_offset,
+      "int":$gather_scatter_dim,
+      "ArrayRef<int64_t>":$sizes,
+      "ArrayRef<OpFoldResult>":$strides,
+      "ArrayRef<OpFoldResult>":$offsets)>,
+  ];
+
+  // TODO: add a verifier and a canonicalizer for this op.
+  //let hasVerifier = 1;
+  //let hasCanonicalizer = 1;
+}
+
 def TTS_GetStructuredStateOp : TTS_Op<"get_structured_state", [AttrSizedResultSegments, Pure]> {
   let summary = "Placeholder for the structured pointer states computed during PtrAnalysis.";
   let description = "Used to pass the offsets and strides to scf.for op to simplify IR rewrites.";
 
@@ -12,6 +12,7 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
 
 namespace mlir {
 
@@ -74,6 +75,69 @@ SmallVector<Value> ofrsToIndexValues(ArrayRef<OpFoldResult> ofrs,
       }));
 }
 
+// Cast the scalar `v` to `targetTy`, emitting the cast op at `loc` through `b`.
+//  - If either side is `index`, the other must be an integer type and an
+//    arith.index_cast is emitted.
+//  - Otherwise both sides must be integer types: a wider target gets a
+//    sign-extending arith.extsi, anything else gets an arith.trunci.
+Value indexTypeCast(Value v, Type targetTy, const Location loc, OpBuilder &b) {
+  const Type srcTy = v.getType();
+
+  // index <-> integer conversion.
+  const bool involvesIndex = isa<IndexType>(targetTy) || isa<IndexType>(srcTy);
+  if (involvesIndex) {
+    assert((isa<IntegerType>(targetTy) || isa<IntegerType>(srcTy)) &&
+           "Only cast between index type and integer type");
+    return b.create<arith::IndexCastOp>(loc, targetTy, v).getResult();
+  }
+
+  // integer <-> integer conversion, selected by bit width.
+  const auto dstWidth = cast<IntegerType>(targetTy).getWidth();
+  const auto srcWidth = cast<IntegerType>(srcTy).getWidth();
+  if (dstWidth > srcWidth)
+    return b.create<arith::ExtSIOp>(loc, targetTy, v).getResult();
+  return b.create<arith::TruncIOp>(loc, targetTy, v).getResult();
+}
+
+// Convert `ofr` to the type of the Value held by `targetForTy`, inserting any
+// required ops at `loc` through `b`.
+//  - `targetForTy` is a constant (getIntAttr succeeds): `ofr` is returned as is.
+//  - The types already match: `ofr` is returned as is.
+//  - scalar -> shaped: cast to the target element type, then triton::SplatOp.
+//  - shaped -> shaped: shapes must be equal; only the element type is cast.
+//  - scalar -> scalar: cast through indexTypeCast.
+// Integer widening is done with sign extension (arith.extsi).
+OpFoldResult expandOFRIndex(OpFoldResult ofr, OpFoldResult targetForTy,
+                            const Location loc, OpBuilder &b) {
+  // A constant target carries no Value type to convert to.
+  if (getIntAttr(targetForTy))
+    return ofr;
+  Value targetValueForTy = cast<Value>(targetForTy);
+  Type targetTy = targetValueForTy.getType();
+  auto targetShapedTy = dyn_cast<ShapedType>(targetTy);
+
+  // Determine the source type first. A constant `ofr` is materialized only
+  // once a conversion is known to be required, so that no dead arith.constant
+  // is left behind when the types already match.
+  Value v = dyn_cast<Value>(ofr);
+  IntegerAttr constAttr;
+  if (!v)
+    constAttr = cast<IntegerAttr>(cast<Attribute>(ofr));
+
+  Type ty = v ? v.getType() : constAttr.getType();
+  if (targetTy == ty)
+    return ofr;
+
+  if (!v)
+    v = b.create<arith::ConstantOp>(loc, constAttr);
+
+  auto shapedTy = dyn_cast<ShapedType>(ty);
+  if (targetShapedTy && !shapedTy) {
+    Type targetEltTy = targetShapedTy.getElementType();
+    // cast to target element type first.
+    if (targetEltTy != ty)
+      v = indexTypeCast(v, targetEltTy, loc, b);
+    return b.create<triton::SplatOp>(loc, targetTy, v).getResult();
+  } else if (targetShapedTy && shapedTy) {
+    // Only same-shape ShapedType to ShapedType conversion is handled: the
+    // element type is cast, the shape is never changed.
+    Type targetEltTy = targetShapedTy.getElementType();
+    Type eltTy = shapedTy.getElementType();
+    if (targetShapedTy.getShape() != shapedTy.getShape())
+      llvm_unreachable("ShapedType to ShapedType must have same shape");
+    if (isa<IndexType>(targetEltTy) || isa<IndexType>(eltTy)) {
+      assert((isa<IntegerType>(targetEltTy) || isa<IntegerType>(eltTy)) &&
+             "Only cast between index type and integer type");
+      return b.create<arith::IndexCastOp>(loc, targetTy, v).getResult();
+    } else {
+      auto targetIntTy = cast<IntegerType>(targetEltTy);
+      auto intTy = cast<IntegerType>(eltTy);
+      if (targetIntTy.getWidth() > intTy.getWidth())
+        return b.create<arith::ExtSIOp>(loc, targetTy, v).getResult();
+      else
+        return b.create<arith::TruncIOp>(loc, targetTy, v).getResult();
+    }
+  } else {
+    // The target is a scalar here, so a shaped source cannot be converted.
+    assert(!shapedTy && "src type rank should be <= target type rank");
+    return indexTypeCast(v, targetTy, loc, b);
+  }
+}
+
 OpFoldResult addOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
                      const Location loc, OpBuilder &b) {
   auto lhsIntAttr = getIntAttr(lhs);
@@ -95,17 +159,13 @@ OpFoldResult addOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
     auto lhsOp =
         b.create<arith::ConstantOp>(loc, b.getIndexAttr(lhsIntAttr.value()));
     lhsValue = lhsOp.getResult();
-  } else {
-    assert(isa<IndexType>(lhsValue.getType()));
   }
 
   auto rhsValue = dyn_cast<Value>(rhs);
   if (rhsIntAttr) {
     auto rhsOp =
         b.create<arith::ConstantOp>(loc, b.getIndexAttr(rhsIntAttr.value()));
     rhsValue = rhsOp.getResult();
-  } else {
-    assert(isa<IndexType>(lhsValue.getType()));
   }
 
   return b.create<arith::AddIOp>(loc, lhsValue, rhsValue).getResult();
@@ -143,50 +203,57 @@ OpFoldResult subOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
   return sumOp.getResult();
 }
 
-OpFoldResult mulOFRValue(const OpFoldResult lhs, const Value rhs,
+// Multiply two OFRs. Operands that are constant attributes, or Values defined
+// by an integer arith.constant, are treated as constants: x*0 and x*1 are
+// short-circuited and constant*constant folds to an index attribute.
+// Otherwise an arith.muli is emitted and its result returned.
+OpFoldResult mulOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
                          const Location loc, OpBuilder &b) {
   auto lhsIntAttr = getIntAttr(lhs);
+  auto rhsIntAttr = getIntAttr(rhs);
 
-  auto rhsIsConst = false;
-  // if rhs is not a const, use max value since min is used to represent
-  // dynamic size or stride
-  auto rhsConstValue = std::numeric_limits<int64_t>::max();
-  auto rhsOp = rhs.getDefiningOp<arith::ConstantOp>();
-  if (rhsOp) {
-    rhsIsConst = true;
-    rhsConstValue = cast<IntegerAttr>(rhsOp.getValue()).getInt();
+  // Look through Values defined by arith.constant. Only scalar integer
+  // constants are folded; dyn_cast keeps other constants (e.g. dense tensor
+  // constants) as ordinary Values instead of asserting on them.
+  auto lhsValue = dyn_cast<Value>(lhs);
+  if (lhsValue) {
+    if (auto lhsOp = lhsValue.getDefiningOp<arith::ConstantOp>()) {
+      if (auto lhsConst = dyn_cast<IntegerAttr>(lhsOp.getValue()))
+        lhsIntAttr = lhsConst.getInt();
+    }
+  }
+  auto rhsValue = dyn_cast<Value>(rhs);
+  if (rhsValue) {
+    if (auto rhsOp = rhsValue.getDefiningOp<arith::ConstantOp>()) {
+      if (auto rhsConst = dyn_cast<IntegerAttr>(rhsOp.getValue()))
+        rhsIntAttr = rhsConst.getInt();
+    }
   }
 
-  // shortcuts for special cases
+  // shortcut for special cases
   if (lhsIntAttr) {
     if (lhsIntAttr.value() == 0)
       return lhs;
     if (lhsIntAttr.value() == 1)
       return rhs;
   }
-  if (rhsIsConst) {
-    if (rhsConstValue == 0)
-      return rhsOp.getResult();
-    if (rhsConstValue == 1)
+
+  if (rhsIntAttr) {
+    if (rhsIntAttr.value() == 0)
+      return rhs;
+    if (rhsIntAttr.value() == 1)
       return lhs;
   }
 
-  // 0. both lhs and rhs are constants
-  if (lhsIntAttr && rhsIsConst)
-    return b.getIndexAttr(lhsIntAttr.value() * rhsConstValue);
+  // both lhs and rhs are constants, return result directly
+  if (lhsIntAttr && rhsIntAttr)
+    return b.getIndexAttr(lhsIntAttr.value() * rhsIntAttr.value());
 
-  // 1. if lhs is constant but rhs is not
-  if (lhsIntAttr && !rhsIsConst) {
-    auto lhsConstOp =
+  // otherwise, need to create instructions to calculate new attribute value.
+  // A constant is materialized only for an operand that is not already a
+  // Value: reusing an existing constant Value keeps its original type and
+  // avoids emitting a duplicate arith.constant.
+  if (lhsIntAttr && !lhsValue) {
+    auto lhsOp =
         b.create<arith::ConstantOp>(loc, b.getIndexAttr(lhsIntAttr.value()));
-    auto mulOp = b.create<arith::MulIOp>(loc, lhsConstOp.getResult(), rhs);
-    return mulOp.getResult();
+    lhsValue = lhsOp.getResult();
+  }
+
+  if (rhsIntAttr && !rhsValue) {
+    auto rhsOp =
+        b.create<arith::ConstantOp>(loc, b.getIndexAttr(rhsIntAttr.value()));
+    rhsValue = rhsOp.getResult();
   }
 
-  // 2. if lhs is not constant
-  assert(!lhsIntAttr);
-  auto mulOp = b.create<arith::MulIOp>(loc, cast<Value>(lhs), rhs);
-  return mulOp.getResult();
+  return b.create<arith::MulIOp>(loc, lhsValue, rhsValue).getResult();
 }
 
 OpFoldResult minOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
 
@@ -130,9 +130,9 @@ void PtrState::mulState(const PtrState &lhsState, const PtrState &rhsState,
 
   for (uint64_t i = 0; i < lhs->sizes.size(); i++) {
     OpFoldResult newOffset =
-        mulOFRValue(lhs->offsets[i], rhs->scalar, loc, rewriter);
+        mulOFRs(lhs->offsets[i], rhs->scalar, loc, rewriter);
     OpFoldResult newStride =
-        mulOFRValue(lhs->strides[i], rhs->scalar, loc, rewriter);
+        mulOFRs(lhs->strides[i], rhs->scalar, loc, rewriter);
     offsets.push_back(newOffset);
     strides.push_back(newStride);
     sizes.push_back(lhs->sizes[i]);