PaddlePaddle
diff --git a/paddle/phi/kernels/funcs/fast_ln_v1.h b/paddle/phi/kernels/funcs/fast_ln_v1.h
Lines changed: 3 additions & 3 deletions
diff --git a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu
Lines changed: 3 additions & 3 deletions
diff --git a/paddle/phi/kernels/fusion/gpu/fused_attention_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_attention_grad_kernel.cu
Lines changed: 25 additions & 25 deletions
diff --git a/paddle/phi/kernels/fusion/gpu/fused_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_attention_kernel.cu
Lines changed: 19 additions & 19 deletions
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu
Lines changed: 3 additions & 3 deletions
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
Lines changed: 2 additions & 2 deletions
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
Lines changed: 2 additions & 2 deletions
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
Lines changed: 14 additions & 14 deletions
diff --git a/paddle/phi/kernels/fusion/gpu/fused_feedforward_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_feedforward_grad_kernel.cu
Lines changed: 19 additions & 19 deletions
@@ -65,8 +65,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_v1_fwd_kernel(
 #pragma unroll
   for (int it = 0, col = c; it < LDGS; it++) {
     if (col < cols) {
-      phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
-      phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
+      Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
+      Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
     } else {
       gamma[it] = Vec_scale{};
       beta[it] = Vec_scale{};
@@ -80,7 +80,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_v1_fwd_kernel(
 #pragma unroll
     for (int it = 0, col = c; it < LDGS; it++) {
       if (col < cols) {
-        phi::Load<T, VecSize>(
+        Load<T, VecSize>(
             x_ptr + static_cast<int64_t>(row) * ELTS_PER_ROW + col * VecSize,
             &x[it]);
       } else {
 
@@ -280,16 +280,16 @@ __global__ void DequantKernel(T* output,
   AlignedVector<T, VecSize> out_vec;
 
   for (; idx < numel; idx += stride) {
-    phi::Load<int32_t, VecSize>(input + idx, &in_vec);
-    phi::Load<float, VecSize>(dequant_out_scale_data + col_id, &out_scale_vec);
+    Load<int32_t, VecSize>(input + idx, &in_vec);
+    Load<float, VecSize>(dequant_out_scale_data + col_id, &out_scale_vec);
 
 #pragma unroll
     for (int i = 0; i < VecSize; ++i) {
       out_vec[i] =
           static_cast<T>(static_cast<float>(in_vec[i]) * out_scale_vec[i]);
     }
 
-    phi::Store<T, VecSize>(out_vec, output + idx);
+    Store<T, VecSize>(out_vec, output + idx);
   }
 }
 
 
@@ -167,13 +167,13 @@ void FusedAttentionGradKernel(
 
   const bool is_upscale_in_train =
       (dropout_implementation == "upscale_in_train");
-  phi::fusion::DropoutParam dropout_param2(dropout_fix_seed,
-                                           0,
-                                           is_test,
-                                           is_upscale_in_train,
-                                           dropout_rate,
-                                           nullptr,
-                                           dropout_seed);
+  fusion::DropoutParam dropout_param2(dropout_fix_seed,
+                                      0,
+                                      is_test,
+                                      is_upscale_in_train,
+                                      dropout_rate,
+                                      nullptr,
+                                      dropout_seed);
   const bool has_dropout = (dropout_param2.dropout_prob != 0.0f);
 
   bool is_upscale_in_train_1 =
@@ -324,31 +324,31 @@ void FusedAttentionGradKernel(
   bool transB = transpose_qkv_wb ? false : true;
   bool compute_qkv_bias = qkv_bias_p ? true : false;
   auto layer_norm_compute =
-      phi::fusion::AttnLayerNorm<T>(dev_ctx, epsilon, bsz_seq, dim_embed);
-  auto qkv_compute = phi::fusion::AttnMatMul<T>(dev_ctx,
-                                                transA,
-                                                transB,
-                                                bsz_seq,
-                                                output_size,
-                                                input_size,
-                                                compute_qkv_bias);
-  phi::fusion::AttnDropoutParam attn_dropout_param(is_test,
-                                                   attn_dropout_implementation,
-                                                   attn_dropout_rate,
-                                                   is_upscale_in_train_1,
-                                                   attn_dropout_fix_seed,
-                                                   attn_dropout_seed,
-                                                   seed_1);
-  auto fmha_ref_compute = phi::fusion::FMHARef<T>(
+      fusion::AttnLayerNorm<T>(dev_ctx, epsilon, bsz_seq, dim_embed);
+  auto qkv_compute = fusion::AttnMatMul<T>(dev_ctx,
+                                           transA,
+                                           transB,
+                                           bsz_seq,
+                                           output_size,
+                                           input_size,
+                                           compute_qkv_bias);
+  fusion::AttnDropoutParam attn_dropout_param(is_test,
+                                              attn_dropout_implementation,
+                                              attn_dropout_rate,
+                                              is_upscale_in_train_1,
+                                              attn_dropout_fix_seed,
+                                              attn_dropout_seed,
+                                              seed_1);
+  auto fmha_ref_compute = fusion::FMHARef<T>(
       dev_ctx, batch_size, max_seq_len, num_head, dim_head, attn_dropout_param);
   output_size = hidden_size;
   transA = false;
   transB = false;
   bool compute_bias = false;
   // (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed)
-  auto out_linear_compute = phi::fusion::AttnMatMul<T>(
+  auto out_linear_compute = fusion::AttnMatMul<T>(
       dev_ctx, transA, transB, bsz_seq, input_size, output_size, compute_bias);
-  phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
+  fusion::FusedDropoutLayerNormHelper<T, uint8_t>
       fused_dropout_layernorm_helper(
           dev_ctx, bsz_seq, dim_embed, dropout_param2, ln_epsilon);
 
 
@@ -139,13 +139,13 @@ void FusedAttentionKernel(const Context &dev_ctx,
 
   const bool is_upscale_in_train =
       (dropout_implementation == "upscale_in_train");
-  phi::fusion::DropoutParam dropout_param2(dropout_fix_seed,
-                                           0,
-                                           is_test,
-                                           is_upscale_in_train,
-                                           dropout_rate,
-                                           nullptr,
-                                           dropout_seed);
+  fusion::DropoutParam dropout_param2(dropout_fix_seed,
+                                      0,
+                                      is_test,
+                                      is_upscale_in_train,
+                                      dropout_rate,
+                                      nullptr,
+                                      dropout_seed);
 
   const bool has_dropout = (dropout_param2.dropout_prob != 0.0f);
 
@@ -240,25 +240,25 @@ void FusedAttentionKernel(const Context &dev_ctx,
   int input_size = dim_embed;
 
   auto layer_norm_compute =
-      phi::fusion::AttnLayerNorm<T>(dev_ctx, epsilon, bsz_seq, dim_embed);
+      fusion::AttnLayerNorm<T>(dev_ctx, epsilon, bsz_seq, dim_embed);
 
   bool compute_bias = true;
   if (qkv_bias_p == nullptr) {
     compute_bias = false;
   }
   // (transA, transB, compute_bias) = (false, true, true)
   bool transB = transpose_qkv_wb ? false : true;
-  auto qkv_compute = phi::fusion::AttnMatMul<T>(
+  auto qkv_compute = fusion::AttnMatMul<T>(
       dev_ctx, false, transB, bsz_seq, output_size, input_size, compute_bias);
 
-  phi::fusion::AttnDropoutParam attn_dropout_param(is_test,
-                                                   attn_dropout_implementation,
-                                                   attn_dropout_rate,
-                                                   is_upscale_in_train_1,
-                                                   attn_dropout_fix_seed,
-                                                   attn_dropout_seed,
-                                                   seed_1);
-  auto fmha_ref_compute = phi::fusion::FMHARef<T>(
+  fusion::AttnDropoutParam attn_dropout_param(is_test,
+                                              attn_dropout_implementation,
+                                              attn_dropout_rate,
+                                              is_upscale_in_train_1,
+                                              attn_dropout_fix_seed,
+                                              attn_dropout_seed,
+                                              seed_1);
+  auto fmha_ref_compute = fusion::FMHARef<T>(
       dev_ctx, batch_size, max_seq_len, num_head, dim_head, attn_dropout_param);
 
   output_size = hidden_size;
@@ -268,9 +268,9 @@ void FusedAttentionKernel(const Context &dev_ctx,
   // which is actually the input size. While the input size is hidden size,
   // which is actually the output size. So for out linear, switch the
   // input size and output size.
-  auto out_linear_compute = phi::fusion::AttnMatMul<T>(
+  auto out_linear_compute = fusion::AttnMatMul<T>(
       dev_ctx, false, false, bsz_seq, input_size, output_size, false);
-  phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
+  fusion::FusedDropoutLayerNormHelper<T, uint8_t>
       fused_dropout_layernorm_helper(
           dev_ctx, bsz_seq, dim_embed, dropout_param2, ln_epsilon);
 
 
@@ -50,8 +50,8 @@ __global__ void ActFFNGlu(const T *bias,
     load_func.template load<VecSize>(&src_vec2, index + hid_dim);
 
     if (bias) {
-      phi::Load<T, VecSize>(&bias[idx], &bias_vec1);
-      phi::Load<T, VecSize>(&bias[idx + hid_dim], &bias_vec2);
+      Load<T, VecSize>(&bias[idx], &bias_vec1);
+      Load<T, VecSize>(&bias[idx + hid_dim], &bias_vec2);
     }
 #pragma unroll
     for (int j = 0; j < VecSize; j++) {
@@ -134,7 +134,7 @@ __global__ void BiasAct(const T *bias,
     int64_t linear_idx = row_idx * cols + col_idx;
     load_func.template load<VecSize>(&src_vec, linear_idx);
     if (bias) {
-      phi::Load<T, VecSize>(&bias[col_idx], &bias_vec);
+      Load<T, VecSize>(&bias[col_idx], &bias_vec);
     }
 #pragma unroll
     for (int j = 0; j < VecSize; j++) {
 
@@ -102,15 +102,15 @@ void FusedBiasDropoutResidualLnGradKernel(
     bsz_seq *= input_x_dims[i];
   }
   int64_t dim_embed = input_x_dims[input_x_dims.size() - 1];
-  phi::fusion::DropoutParam dropout_param(
+  fusion::DropoutParam dropout_param(
       dropout_fix_seed,
       0,
       is_test,
       dropout_implementation == "upscale_in_train",
       dropout_rate,
       nullptr,
       dropout_seed);
-  phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
+  fusion::FusedDropoutLayerNormHelper<T, uint8_t>
       fused_dropout_layernorm_helper(
           dev_ctx, bsz_seq, dim_embed, dropout_param, ln_epsilon);
   fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad(
 
@@ -68,15 +68,15 @@ void FusedBiasDropoutResidualLnKernel(const Context& dev_ctx,
     bsz_seq *= input_x_dims[i];
   }
   int dim_embed = input_x_dims[input_x_dims.size() - 1];
-  phi::fusion::DropoutParam dropout_param(
+  fusion::DropoutParam dropout_param(
       dropout_fix_seed,
       0,
       is_test,
       dropout_implementation == "upscale_in_train",
       dropout_rate,
       nullptr,
       dropout_seed);
-  phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
+  fusion::FusedDropoutLayerNormHelper<T, uint8_t>
       fused_dropout_layernorm_helper(
           dev_ctx, bsz_seq, dim_embed, dropout_param, ln_epsilon);
   // output = layernorm(residual + dropout(input + bias))
 
@@ -162,11 +162,11 @@ __global__ void FusedActBias(Functor act,
        idx < elem_cnt;
        idx += step) {
     const int32_t col_idx = idx % cols;
-    phi::Load<InType, VecSize>(&src[idx], &src_vec);
-    phi::Load<float, VecSize>(&dequant_out_scale_data[col_idx],
-                              &dequant_out_scale_vec);
+    Load<InType, VecSize>(&src[idx], &src_vec);
+    Load<float, VecSize>(&dequant_out_scale_data[col_idx],
+                         &dequant_out_scale_vec);
     if (bias) {
-      phi::Load<T, VecSize>(&bias[col_idx], &bias_vec);
+      Load<T, VecSize>(&bias[col_idx], &bias_vec);
     }
 #pragma unroll
     for (int32_t unroll_idx = 0; unroll_idx < VecSize; unroll_idx++) {
@@ -194,7 +194,7 @@ __global__ void FusedActBias(Functor act,
         }
       }
     }
-    phi::Store<OutType, VecSize>(out_vec, &dst[idx]);
+    Store<OutType, VecSize>(out_vec, &dst[idx]);
   }
 }
 
@@ -322,17 +322,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad,
     LoadT src_vec;
     MaskLoadT mask_vec;
 
-    phi::Load<T, VecSize>(&dout[i], &dout_vec);
-    phi::Load<MaskType, VecSize>(&mask[i], &mask_vec);
-    phi::Load<T, VecSize>(&src[i], &src_vec);
+    Load<T, VecSize>(&dout[i], &dout_vec);
+    Load<MaskType, VecSize>(&mask[i], &mask_vec);
+    Load<T, VecSize>(&src[i], &src_vec);
 
     StoreT dx_vec;
 #pragma unroll
     for (int ii = 0; ii < VecSize; ii++) {
       T tmp = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
       dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]);
     }
-    phi::Store<T, VecSize>(dx_vec, &dx[i]);
+    Store<T, VecSize>(dx_vec, &dx[i]);
   }
 }
 
@@ -376,10 +376,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
       LoadT bias_vec;
       MaskLoadT mask_vec;
 
-      phi::Load<T, VecSize>(&dout[index], &dout_vec);
-      phi::Load<T, VecSize>(&src[index], &src_vec);
-      phi::Load<MaskType, VecSize>(&mask[index], &mask_vec);
-      phi::Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);
+      Load<T, VecSize>(&dout[index], &dout_vec);
+      Load<T, VecSize>(&src[index], &src_vec);
+      Load<MaskType, VecSize>(&mask[index], &mask_vec);
+      Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);
 
       StoreT dx_vec;
 #pragma unroll
@@ -390,7 +390,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
         dx_vec[i] = val;
         tmp_sum[i] += val;
       }
-      phi::Store<T, VecSize>(dx_vec, &dx[index]);
+      Store<T, VecSize>(dx_vec, &dx[index]);
     }
   }
 
 
@@ -77,19 +77,19 @@ void FFNGrad(const GPUContext& dev_ctx,
              const int bsz_seq,
              const int d_model,
              const int dim_feedforward,
-             const phi::fusion::DropoutParam& dropout_param1,
-             const phi::fusion::DropoutParam& dropout_param2,
+             const fusion::DropoutParam& dropout_param1,
+             const fusion::DropoutParam& dropout_param2,
              const std::string& act_method,
              const bool pre_layer_norm,
              const float epsilon1,
              const float epsilon2,
              const bool add_residual,
              const int ring_id) {
-  phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t> pre_layernorm_helper(
+  fusion::FusedDropoutLayerNormHelper<T, uint8_t> pre_layernorm_helper(
       bsz_seq, d_model, epsilon1);
-  phi::fusion::FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
+  fusion::FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
       dev_ctx, bsz_seq, dim_feedforward, dropout_param1);
-  phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
+  fusion::FusedDropoutLayerNormHelper<T, uint8_t>
       fused_dropout_layernorm_helper(
           dev_ctx, bsz_seq, d_model, dropout_param2, epsilon2);
 
@@ -283,20 +283,20 @@ void FusedFeedForwardGradKernel(const Context& dev_ctx,
   bool is_upscale_in_train1 = dropout1_implementation == "upscale_in_train";
   bool is_upscale_in_train2 = dropout2_implementation == "upscale_in_train";
 
-  phi::fusion::DropoutParam dropout_param1(dropout1_fix_seed,
-                                           0,
-                                           is_test,
-                                           is_upscale_in_train1,
-                                           dropout1_prob,
-                                           nullptr,
-                                           dropout1_seed_val);
-  phi::fusion::DropoutParam dropout_param2(dropout2_fix_seed,
-                                           0,
-                                           is_test,
-                                           is_upscale_in_train2,
-                                           dropout2_prob,
-                                           nullptr,
-                                           dropout2_seed_val);
+  fusion::DropoutParam dropout_param1(dropout1_fix_seed,
+                                      0,
+                                      is_test,
+                                      is_upscale_in_train1,
+                                      dropout1_prob,
+                                      nullptr,
+                                      dropout1_seed_val);
+  fusion::DropoutParam dropout_param2(dropout2_fix_seed,
+                                      0,
+                                      is_test,
+                                      is_upscale_in_train2,
+                                      dropout2_prob,
+                                      nullptr,
+                                      dropout2_seed_val);
 
   dev_ctx.template Alloc<T>(d_x, d_x->numel() * sizeof(T));
   if (d_ln1_scale) {
Original file line number	Diff line number	Diff line change
`@@ -280,16 +280,16 @@ __global__ void DequantKernel(T* output,`
`280`	`280`	`AlignedVector<T, VecSize> out_vec;`
`281`	`281`
`282`	`282`	`for (; idx < numel; idx += stride) {`
`283`		`- phi::Load<int32_t, VecSize>(input + idx, &in_vec);`
`284`		`- phi::Load<float, VecSize>(dequant_out_scale_data + col_id, &out_scale_vec);`
	`283`	`+ Load<int32_t, VecSize>(input + idx, &in_vec);`
	`284`	`+ Load<float, VecSize>(dequant_out_scale_data + col_id, &out_scale_vec);`
`285`	`285`
`286`	`286`	`#pragma unroll`
`287`	`287`	`for (int i = 0; i < VecSize; ++i) {`
`288`	`288`	`out_vec[i] =`
`289`	`289`	`static_cast<T>(static_cast<float>(in_vec[i]) * out_scale_vec[i]);`
`290`	`290`	`}`
`291`	`291`
`292`		`- phi::Store<T, VecSize>(out_vec, output + idx);`
	`292`	`+ Store<T, VecSize>(out_vec, output + idx);`
`293`	`293`	`}`
`294`	`294`	`}`
`295`	`295`
Original file line number	Diff line number	Diff line change
`@@ -162,11 +162,11 @@ __global__ void FusedActBias(Functor act,`
`162`	`162`	`idx < elem_cnt;`
`163`	`163`	`idx += step) {`
`164`	`164`	`const int32_t col_idx = idx % cols;`
`165`		`- phi::Load<InType, VecSize>(&src[idx], &src_vec);`
`166`		`- phi::Load<float, VecSize>(&dequant_out_scale_data[col_idx],`
`167`		`- &dequant_out_scale_vec);`
	`165`	`+ Load<InType, VecSize>(&src[idx], &src_vec);`
	`166`	`+ Load<float, VecSize>(&dequant_out_scale_data[col_idx],`
	`167`	`+ &dequant_out_scale_vec);`
`168`	`168`	`if (bias) {`
`169`		`- phi::Load<T, VecSize>(&bias[col_idx], &bias_vec);`
	`169`	`+ Load<T, VecSize>(&bias[col_idx], &bias_vec);`
`170`	`170`	`}`
`171`	`171`	`#pragma unroll`
`172`	`172`	`for (int32_t unroll_idx = 0; unroll_idx < VecSize; unroll_idx++) {`
`@@ -194,7 +194,7 @@ __global__ void FusedActBias(Functor act,`
`194`	`194`	`}`
`195`	`195`	`}`
`196`	`196`	`}`
`197`		`- phi::Store<OutType, VecSize>(out_vec, &dst[idx]);`
	`197`	`+ Store<OutType, VecSize>(out_vec, &dst[idx]);`
`198`	`198`	`}`
`199`	`199`	`}`
`200`	`200`
`@@ -322,17 +322,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad,`
`322`	`322`	`LoadT src_vec;`
`323`	`323`	`MaskLoadT mask_vec;`
`324`	`324`
`325`		`- phi::Load<T, VecSize>(&dout[i], &dout_vec);`
`326`		`- phi::Load<MaskType, VecSize>(&mask[i], &mask_vec);`
`327`		`- phi::Load<T, VecSize>(&src[i], &src_vec);`
	`325`	`+ Load<T, VecSize>(&dout[i], &dout_vec);`
	`326`	`+ Load<MaskType, VecSize>(&mask[i], &mask_vec);`
	`327`	`+ Load<T, VecSize>(&src[i], &src_vec);`
`328`	`328`
`329`	`329`	`StoreT dx_vec;`
`330`	`330`	`#pragma unroll`
`331`	`331`	`for (int ii = 0; ii < VecSize; ii++) {`
`332`	`332`	`T tmp = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;`
`333`	`333`	`dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]);`
`334`	`334`	`}`
`335`		`- phi::Store<T, VecSize>(dx_vec, &dx[i]);`
	`335`	`+ Store<T, VecSize>(dx_vec, &dx[i]);`
`336`	`336`	`}`
`337`	`337`	`}`
`338`	`338`
`@@ -376,10 +376,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(`
`376`	`376`	`LoadT bias_vec;`
`377`	`377`	`MaskLoadT mask_vec;`
`378`	`378`
`379`		`- phi::Load<T, VecSize>(&dout[index], &dout_vec);`
`380`		`- phi::Load<T, VecSize>(&src[index], &src_vec);`
`381`		`- phi::Load<MaskType, VecSize>(&mask[index], &mask_vec);`
`382`		`- phi::Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);`
	`379`	`+ Load<T, VecSize>(&dout[index], &dout_vec);`
	`380`	`+ Load<T, VecSize>(&src[index], &src_vec);`
	`381`	`+ Load<MaskType, VecSize>(&mask[index], &mask_vec);`
	`382`	`+ Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);`
`383`	`383`
`384`	`384`	`StoreT dx_vec;`
`385`	`385`	`#pragma unroll`
`@@ -390,7 +390,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(`
`390`	`390`	`dx_vec[i] = val;`
`391`	`391`	`tmp_sum[i] += val;`
`392`	`392`	`}`
`393`		`- phi::Store<T, VecSize>(dx_vec, &dx[index]);`
	`393`	`+ Store<T, VecSize>(dx_vec, &dx[index]);`
`394`	`394`	`}`
`395`	`395`	`}`
`396`	`396`