diff --git a/paddle/phi/kernels/funcs/fast_ln_v1.h b/paddle/phi/kernels/funcs/fast_ln_v1.h index 64a58af2531c90..bf077030ca9145 100644 --- a/paddle/phi/kernels/funcs/fast_ln_v1.h +++ b/paddle/phi/kernels/funcs/fast_ln_v1.h @@ -65,8 +65,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_v1_fwd_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { if (col < cols) { - phi::Load(gamma_ptr + col * VecSize, &gamma[it]); - phi::Load(beta_ptr + col * VecSize, &beta[it]); + Load(gamma_ptr + col * VecSize, &gamma[it]); + Load(beta_ptr + col * VecSize, &beta[it]); } else { gamma[it] = Vec_scale{}; beta[it] = Vec_scale{}; @@ -80,7 +80,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_v1_fwd_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { if (col < cols) { - phi::Load( + Load( x_ptr + static_cast(row) * ELTS_PER_ROW + col * VecSize, &x[it]); } else { diff --git a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu index f223b0d3abc1cb..b2cb160e97ff1e 100644 --- a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu @@ -280,8 +280,8 @@ __global__ void DequantKernel(T* output, AlignedVector out_vec; for (; idx < numel; idx += stride) { - phi::Load(input + idx, &in_vec); - phi::Load(dequant_out_scale_data + col_id, &out_scale_vec); + Load(input + idx, &in_vec); + Load(dequant_out_scale_data + col_id, &out_scale_vec); #pragma unroll for (int i = 0; i < VecSize; ++i) { @@ -289,7 +289,7 @@ __global__ void DequantKernel(T* output, static_cast(static_cast(in_vec[i]) * out_scale_vec[i]); } - phi::Store(out_vec, output + idx); + Store(out_vec, output + idx); } } diff --git a/paddle/phi/kernels/fusion/gpu/fused_attention_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_attention_grad_kernel.cu index 9a7f39dd6ccd45..35da73744de65f 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_attention_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_attention_grad_kernel.cu @@ -167,13 +167,13 @@ void FusedAttentionGradKernel( const bool is_upscale_in_train = (dropout_implementation == "upscale_in_train"); - phi::fusion::DropoutParam dropout_param2(dropout_fix_seed, - 0, - is_test, - is_upscale_in_train, - dropout_rate, - nullptr, - dropout_seed); + fusion::DropoutParam dropout_param2(dropout_fix_seed, + 0, + is_test, + is_upscale_in_train, + dropout_rate, + nullptr, + dropout_seed); const bool has_dropout = (dropout_param2.dropout_prob != 0.0f); bool is_upscale_in_train_1 = @@ -324,31 +324,31 @@ void FusedAttentionGradKernel( bool transB = transpose_qkv_wb ? false : true; bool compute_qkv_bias = qkv_bias_p ? true : false; auto layer_norm_compute = - phi::fusion::AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); - auto qkv_compute = phi::fusion::AttnMatMul(dev_ctx, - transA, - transB, - bsz_seq, - output_size, - input_size, - compute_qkv_bias); - phi::fusion::AttnDropoutParam attn_dropout_param(is_test, - attn_dropout_implementation, - attn_dropout_rate, - is_upscale_in_train_1, - attn_dropout_fix_seed, - attn_dropout_seed, - seed_1); - auto fmha_ref_compute = phi::fusion::FMHARef( + fusion::AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); + auto qkv_compute = fusion::AttnMatMul(dev_ctx, + transA, + transB, + bsz_seq, + output_size, + input_size, + compute_qkv_bias); + fusion::AttnDropoutParam attn_dropout_param(is_test, + attn_dropout_implementation, + attn_dropout_rate, + is_upscale_in_train_1, + attn_dropout_fix_seed, + attn_dropout_seed, + seed_1); + auto fmha_ref_compute = fusion::FMHARef( dev_ctx, batch_size, max_seq_len, num_head, dim_head, attn_dropout_param); output_size = hidden_size; transA = false; transB = false; bool compute_bias = false; // (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed) - auto out_linear_compute = phi::fusion::AttnMatMul( + auto out_linear_compute = fusion::AttnMatMul( dev_ctx, transA, transB, bsz_seq, input_size, output_size, compute_bias); - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( dev_ctx, bsz_seq, dim_embed, dropout_param2, ln_epsilon); diff --git a/paddle/phi/kernels/fusion/gpu/fused_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_attention_kernel.cu index b91d9b9c005b79..9360389a764c16 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_attention_kernel.cu @@ -139,13 +139,13 @@ void FusedAttentionKernel(const Context &dev_ctx, const bool is_upscale_in_train = (dropout_implementation == "upscale_in_train"); - phi::fusion::DropoutParam dropout_param2(dropout_fix_seed, - 0, - is_test, - is_upscale_in_train, - dropout_rate, - nullptr, - dropout_seed); + fusion::DropoutParam dropout_param2(dropout_fix_seed, + 0, + is_test, + is_upscale_in_train, + dropout_rate, + nullptr, + dropout_seed); const bool has_dropout = (dropout_param2.dropout_prob != 0.0f); @@ -240,7 +240,7 @@ void FusedAttentionKernel(const Context &dev_ctx, int input_size = dim_embed; auto layer_norm_compute = - phi::fusion::AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); + fusion::AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); bool compute_bias = true; if (qkv_bias_p == nullptr) { @@ -248,17 +248,17 @@ void FusedAttentionKernel(const Context &dev_ctx, } // (transA, transB, compute_bias) = (false, true, true) bool transB = transpose_qkv_wb ? false : true; - auto qkv_compute = phi::fusion::AttnMatMul( + auto qkv_compute = fusion::AttnMatMul( dev_ctx, false, transB, bsz_seq, output_size, input_size, compute_bias); - phi::fusion::AttnDropoutParam attn_dropout_param(is_test, - attn_dropout_implementation, - attn_dropout_rate, - is_upscale_in_train_1, - attn_dropout_fix_seed, - attn_dropout_seed, - seed_1); - auto fmha_ref_compute = phi::fusion::FMHARef( + fusion::AttnDropoutParam attn_dropout_param(is_test, + attn_dropout_implementation, + attn_dropout_rate, + is_upscale_in_train_1, + attn_dropout_fix_seed, + attn_dropout_seed, + seed_1); + auto fmha_ref_compute = fusion::FMHARef( dev_ctx, batch_size, max_seq_len, num_head, dim_head, attn_dropout_param); output_size = hidden_size; @@ -268,9 +268,9 @@ void FusedAttentionKernel(const Context &dev_ctx, // which is actually the input size. While the input size is hidden size, // which is actually the output size. So for out linear, switch the // input size and output size. - auto out_linear_compute = phi::fusion::AttnMatMul( + auto out_linear_compute = fusion::AttnMatMul( dev_ctx, false, false, bsz_seq, input_size, output_size, false); - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( dev_ctx, bsz_seq, dim_embed, dropout_param2, ln_epsilon); diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu index 4878ed4b7956cb..81247cffa4fdf8 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu @@ -50,8 +50,8 @@ __global__ void ActFFNGlu(const T *bias, load_func.template load(&src_vec2, index + hid_dim); if (bias) { - phi::Load(&bias[idx], &bias_vec1); - phi::Load(&bias[idx + hid_dim], &bias_vec2); + Load(&bias[idx], &bias_vec1); + Load(&bias[idx + hid_dim], &bias_vec2); } #pragma unroll for (int j = 0; j < VecSize; j++) { @@ -134,7 +134,7 @@ __global__ void BiasAct(const T *bias, int64_t linear_idx = row_idx * cols + col_idx; load_func.template load(&src_vec, linear_idx); if (bias) { - phi::Load(&bias[col_idx], &bias_vec); + Load(&bias[col_idx], &bias_vec); } #pragma unroll for (int j = 0; j < VecSize; j++) { diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu index 9580d500218e0c..932b494f8a9490 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu @@ -102,7 +102,7 @@ void FusedBiasDropoutResidualLnGradKernel( bsz_seq *= input_x_dims[i]; } int64_t dim_embed = input_x_dims[input_x_dims.size() - 1]; - phi::fusion::DropoutParam dropout_param( + fusion::DropoutParam dropout_param( dropout_fix_seed, 0, is_test, @@ -110,7 +110,7 @@ void FusedBiasDropoutResidualLnGradKernel( dropout_rate, nullptr, dropout_seed); - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( dev_ctx, bsz_seq, dim_embed, dropout_param, ln_epsilon); fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu index 23db31e0c3e14d..82f119981c92f0 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu @@ -68,7 +68,7 @@ void FusedBiasDropoutResidualLnKernel(const Context& dev_ctx, bsz_seq *= input_x_dims[i]; } int dim_embed = input_x_dims[input_x_dims.size() - 1]; - phi::fusion::DropoutParam dropout_param( + fusion::DropoutParam dropout_param( dropout_fix_seed, 0, is_test, @@ -76,7 +76,7 @@ void FusedBiasDropoutResidualLnKernel(const Context& dev_ctx, dropout_rate, nullptr, dropout_seed); - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( dev_ctx, bsz_seq, dim_embed, dropout_param, ln_epsilon); // output = layernorm(residual + dropout(input + bias)) diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h index c1c007a966c076..193260a4af0781 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h @@ -162,11 +162,11 @@ __global__ void FusedActBias(Functor act, idx < elem_cnt; idx += step) { const int32_t col_idx = idx % cols; - phi::Load(&src[idx], &src_vec); - phi::Load(&dequant_out_scale_data[col_idx], - &dequant_out_scale_vec); + Load(&src[idx], &src_vec); + Load(&dequant_out_scale_data[col_idx], + &dequant_out_scale_vec); if (bias) { - phi::Load(&bias[col_idx], &bias_vec); + Load(&bias[col_idx], &bias_vec); } #pragma unroll for (int32_t unroll_idx = 0; unroll_idx < VecSize; unroll_idx++) { @@ -194,7 +194,7 @@ __global__ void FusedActBias(Functor act, } } } - phi::Store(out_vec, &dst[idx]); + Store(out_vec, &dst[idx]); } } @@ -322,9 +322,9 @@ __global__ void FusedDropoutActGrad(Functor act_grad, LoadT src_vec; MaskLoadT mask_vec; - phi::Load(&dout[i], &dout_vec); - phi::Load(&mask[i], &mask_vec); - phi::Load(&src[i], &src_vec); + Load(&dout[i], &dout_vec); + Load(&mask[i], &mask_vec); + Load(&src[i], &src_vec); StoreT dx_vec; #pragma unroll @@ -332,7 +332,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, T tmp = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]); } - phi::Store(dx_vec, &dx[i]); + Store(dx_vec, &dx[i]); } } @@ -376,10 +376,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad( LoadT bias_vec; MaskLoadT mask_vec; - phi::Load(&dout[index], &dout_vec); - phi::Load(&src[index], &src_vec); - phi::Load(&mask[index], &mask_vec); - phi::Load(&bias[col_id * VecSize], &bias_vec); + Load(&dout[index], &dout_vec); + Load(&src[index], &src_vec); + Load(&mask[index], &mask_vec); + Load(&bias[col_id * VecSize], &bias_vec); StoreT dx_vec; #pragma unroll @@ -390,7 +390,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad( dx_vec[i] = val; tmp_sum[i] += val; } - phi::Store(dx_vec, &dx[index]); + Store(dx_vec, &dx[index]); } } diff --git a/paddle/phi/kernels/fusion/gpu/fused_feedforward_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_feedforward_grad_kernel.cu index db91cc179404ef..ae5a7be476089c 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_feedforward_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_feedforward_grad_kernel.cu @@ -77,19 +77,19 @@ void FFNGrad(const GPUContext& dev_ctx, const int bsz_seq, const int d_model, const int dim_feedforward, - const phi::fusion::DropoutParam& dropout_param1, - const phi::fusion::DropoutParam& dropout_param2, + const fusion::DropoutParam& dropout_param1, + const fusion::DropoutParam& dropout_param2, const std::string& act_method, const bool pre_layer_norm, const float epsilon1, const float epsilon2, const bool add_residual, const int ring_id) { - phi::fusion::FusedDropoutLayerNormHelper pre_layernorm_helper( + fusion::FusedDropoutLayerNormHelper pre_layernorm_helper( bsz_seq, d_model, epsilon1); - phi::fusion::FusedDropoutHelper fused_act_dropout_helper( + fusion::FusedDropoutHelper fused_act_dropout_helper( dev_ctx, bsz_seq, dim_feedforward, dropout_param1); - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( dev_ctx, bsz_seq, d_model, dropout_param2, epsilon2); @@ -283,20 +283,20 @@ void FusedFeedForwardGradKernel(const Context& dev_ctx, bool is_upscale_in_train1 = dropout1_implementation == "upscale_in_train"; bool is_upscale_in_train2 = dropout2_implementation == "upscale_in_train"; - phi::fusion::DropoutParam dropout_param1(dropout1_fix_seed, - 0, - is_test, - is_upscale_in_train1, - dropout1_prob, - nullptr, - dropout1_seed_val); - phi::fusion::DropoutParam dropout_param2(dropout2_fix_seed, - 0, - is_test, - is_upscale_in_train2, - dropout2_prob, - nullptr, - dropout2_seed_val); + fusion::DropoutParam dropout_param1(dropout1_fix_seed, + 0, + is_test, + is_upscale_in_train1, + dropout1_prob, + nullptr, + dropout1_seed_val); + fusion::DropoutParam dropout_param2(dropout2_fix_seed, + 0, + is_test, + is_upscale_in_train2, + dropout2_prob, + nullptr, + dropout2_seed_val); dev_ctx.template Alloc(d_x, d_x->numel() * sizeof(T)); if (d_ln1_scale) { diff --git a/paddle/phi/kernels/fusion/gpu/fused_feedforward_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_feedforward_kernel.cu index 65e3cfdc30f1d0..d8e509d58490b6 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_feedforward_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_feedforward_kernel.cu @@ -71,13 +71,13 @@ void FFN(const GPUContext& dev_ctx, const float epsilon2, const bool add_residual, const int ring_id, - const phi::fusion::DropoutParam& dropout_param1, - const phi::fusion::DropoutParam& dropout_param2) { - phi::fusion::FusedDropoutLayerNormHelper pre_layernorm_helper( + const fusion::DropoutParam& dropout_param1, + const fusion::DropoutParam& dropout_param2) { + fusion::FusedDropoutLayerNormHelper pre_layernorm_helper( bsz_seq, d_model, epsilon1); - phi::fusion::FusedDropoutHelper fused_act_dropout_helper( + fusion::FusedDropoutHelper fused_act_dropout_helper( dev_ctx, bsz_seq, dim_feedforward, dropout_param1); - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( dev_ctx, bsz_seq, d_model, dropout_param2, epsilon2); @@ -216,20 +216,20 @@ void FusedFeedForwardKernel(const Context& dev_ctx, auto* dropout1_seed_ptr = dropout1_seed.get_ptr(); auto* dropout2_seed_ptr = dropout2_seed.get_ptr(); - phi::fusion::DropoutParam dropout_param1(dropout1_fix_seed, - 0, - is_test, - is_upscale_in_train1, - dropout1_prob, - dropout1_seed_ptr, - dropout1_seed_val); - phi::fusion::DropoutParam dropout_param2(dropout2_fix_seed, - 0, - is_test, - is_upscale_in_train2, - dropout2_prob, - dropout2_seed_ptr, - dropout2_seed_val); + fusion::DropoutParam dropout_param1(dropout1_fix_seed, + 0, + is_test, + is_upscale_in_train1, + dropout1_prob, + dropout1_seed_ptr, + dropout1_seed_val); + fusion::DropoutParam dropout_param2(dropout2_fix_seed, + 0, + is_test, + is_upscale_in_train2, + dropout2_prob, + dropout2_seed_ptr, + dropout2_seed_val); using U = funcs::LayerNormParamType; dev_ctx.template Alloc(out, out->numel() * sizeof(T)); diff --git a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu index d4adf9c9e9c7d4..43eb6826a0476f 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu @@ -63,7 +63,7 @@ void ComputeMergedQKVMatmulBackward( int n = 3 * config.num_heads * config.head_dim; int k = config.q_dim; auto qkv_compute = - phi::fusion::AttnMatMul(dev_ctx, false, true, m, n, k, false); + fusion::AttnMatMul(dev_ctx, false, true, m, n, k, false); qkv_compute.ComputeBackward(query, qkv_weight, qkv_out_grad, @@ -98,8 +98,8 @@ void ComputeSeparatedQKVMatmulBackward( int kv_m = config.batch_size * config.seq_len_m * config.m_size; int kv_n = config.num_heads * config.head_dim; int kv_k = config.kv_dim; - auto kv_compute = phi::fusion::AttnMatMul( - dev_ctx, false, false, kv_m, kv_n, kv_k, false); + auto kv_compute = + fusion::AttnMatMul(dev_ctx, false, false, kv_m, kv_n, kv_k, false); kv_compute.ComputeBackward( key, key_weight, key_out_grad, key_grad, key_weight_grad, nullptr, false); @@ -123,7 +123,7 @@ void ComputeSeparatedQKVMatmulBackward( int q_n = config.num_heads * config.head_dim; int q_k = config.q_dim; auto q_compute = - phi::fusion::AttnMatMul(dev_ctx, false, false, q_m, q_n, q_k, false); + fusion::AttnMatMul(dev_ctx, false, false, q_m, q_n, q_k, false); q_compute.ComputeBackward(query, query_weight, query_out_grad, @@ -159,7 +159,7 @@ void ComputeGatingLinearBackward( int n = config.num_heads * config.head_dim; int k = config.q_dim; auto gate_linear = - phi::fusion::AttnMatMul(dev_ctx, false, false, m, n, k, true); + fusion::AttnMatMul(dev_ctx, false, false, m, n, k, true); gate_linear.ComputeForward(gate_weight, query, gate_bias, @@ -211,8 +211,7 @@ void ComputeOutputLinearBackward( int m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.q_dim; int k = config.num_heads * config.head_dim; - auto out_linear = - phi::fusion::AttnMatMul(dev_ctx, false, false, m, n, k, true); + auto out_linear = fusion::AttnMatMul(dev_ctx, false, false, m, n, k, true); out_linear.ComputeBackward(input, out_linear_weight, out_grad, diff --git a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu index 8c47793dbde80e..5175ede90263ed 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu @@ -52,7 +52,7 @@ void ComputeMergedQKVMatmulForward(const GPUContext &dev_ctx, int n = 3 * config.num_heads * config.head_dim; int k = config.q_dim; auto qkv_compute = - phi::fusion::AttnMatMul(dev_ctx, false, true, m, n, k, false); + fusion::AttnMatMul(dev_ctx, false, true, m, n, k, false); qkv_compute.ComputeForward(qkv_weight, query, nullptr, qkv_out, nullptr); } @@ -80,7 +80,7 @@ void ComputeSeparatedQKVMatmulForward( int q_n = config.num_heads * config.head_dim; int q_k = config.q_dim; auto q_compute = - phi::fusion::AttnMatMul(dev_ctx, false, false, q_m, q_n, q_k, false); + fusion::AttnMatMul(dev_ctx, false, false, q_m, q_n, q_k, false); q_compute.ComputeForward(query_weight, query, nullptr, query_out, nullptr); // k_out = GEMM(key, key_weight) @@ -90,8 +90,8 @@ void ComputeSeparatedQKVMatmulForward( int kv_m = config.batch_size * config.seq_len_m * config.m_size; int kv_n = config.num_heads * config.head_dim; int kv_k = config.kv_dim; - auto kv_compute = phi::fusion::AttnMatMul( - dev_ctx, false, false, kv_m, kv_n, kv_k, false); + auto kv_compute = + fusion::AttnMatMul(dev_ctx, false, false, kv_m, kv_n, kv_k, false); kv_compute.ComputeForward(key_weight, key, nullptr, key_out, nullptr); // value_out = GEMM(value, value_weight) @@ -118,7 +118,7 @@ void ComputeGatingLinearForward(const GPUContext &dev_ctx, int n = config.num_heads * config.head_dim; int k = config.q_dim; auto gate_linear = - phi::fusion::AttnMatMul(dev_ctx, false, false, m, n, k, true); + fusion::AttnMatMul(dev_ctx, false, false, m, n, k, true); gate_linear.ComputeForward(gate_weight, query, gate_bias, @@ -147,8 +147,7 @@ void ComputeOutputLinearForward(const GPUContext &dev_ctx, int m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.q_dim; int k = config.num_heads * config.head_dim; - auto out_linear = - phi::fusion::AttnMatMul(dev_ctx, false, false, m, n, k, true); + auto out_linear = fusion::AttnMatMul(dev_ctx, false, false, m, n, k, true); out_linear.ComputeForward(out_linear_weight, fmha_or_gate_out, out_linear_bias, diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu index 4608f871f25ad2..28abc332f56311 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu @@ -1064,11 +1064,11 @@ void FusedLayerNormKernel(const Context& dev_ctx, cols *= x.dims()[i]; } - phi::fusion::DropoutParam dropout_param(true, 0, true, true, 0.0, nullptr, 0); - phi::fusion::FusedDropoutLayerNormHelper + fusion::DropoutParam dropout_param(true, 0, true, true, 0.0, nullptr, 0); + fusion::FusedDropoutLayerNormHelper residual_bias_add_layernorm_helper( dev_ctx, rows, cols, dropout_param, epsilon, residual_alpha); - phi::fusion::AttnLayerNorm layernorm_helper(dev_ctx, epsilon, rows, cols); + fusion::AttnLayerNorm layernorm_helper(dev_ctx, epsilon, rows, cols); // Do residual + bias + x if (residual && norm_weight_data == nullptr && norm_bias_data == nullptr) { diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index f1fe1d35c4a494..8dd3ddc2b567c1 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h @@ -75,14 +75,14 @@ __device__ void CalcLayernormY( static_cast>(0); } // vectorize load data from global - phi::Load(&x[row_id * cols + i], &x_vec); + Load(&x[row_id * cols + i], &x_vec); if (scale != nullptr) { - phi::Load, VecSize>( + Load, VecSize>( &scale[i], &scale_vec); } if (bias != nullptr) { - phi::Load, VecSize>( + Load, VecSize>( &bias[i], &bias_vec); } @@ -93,7 +93,7 @@ __device__ void CalcLayernormY( (static_cast(x_vec[ii]) - mean_val) * invvar + static_cast(bias_vec[ii])); } - phi::Store(y_vec, &y[row_id * cols + i]); + Store(y_vec, &y[row_id * cols + i]); } } @@ -595,7 +595,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( if (bias_ptr != nullptr) { #pragma unroll for (int64_t it = 0, col = c; it < LDGS; it++) { - phi::Load(bias_ptr + col * VecSize, &bias[it]); + Load(bias_ptr + col * VecSize, &bias[it]); col += THREADS_PER_ROW; } } @@ -604,8 +604,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int64_t it = 0, col = c; it < LDGS; it++) { - phi::Load(gamma_ptr + col * VecSize, &gamma[it]); - phi::Load(beta_ptr + col * VecSize, &beta[it]); + Load(gamma_ptr + col * VecSize, &gamma[it]); + Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -620,11 +620,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( #pragma unroll for (int64_t it = 0, col = c; it < LDGS; it++) { int64_t index = row * ELTS_PER_ROW + col * VecSize; - phi::Load(residual_ptr + index, &residual[it]); - phi::Load(x_ptr + index, &x_input[it]); + Load(residual_ptr + index, &residual[it]); + Load(x_ptr + index, &x_input[it]); if (quant_out_scale_ptr != nullptr) { - phi::Load(quant_out_scale_ptr + col * VecSize, - &dequant_out_scale[it]); + Load(quant_out_scale_ptr + col * VecSize, + &dequant_out_scale[it]); } col += THREADS_PER_ROW; } @@ -704,14 +704,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { int64_t index = row * ELTS_PER_ROW + col * VecSize; - phi::Store(x[it], residual_out_ptr + index); + Store(x[it], residual_out_ptr + index); col += THREADS_PER_ROW; } if (!is_test && HasDropout) { #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { int64_t index = row * ELTS_PER_ROW + col * VecSize; - phi::Store(mask_vec[it], mask_out_ptr + index); + Store(mask_vec[it], mask_out_ptr + index); col += THREADS_PER_ROW; } } @@ -823,9 +823,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( for (int64_t it = 0, col = c; it < LDGS; it++) { int64_t index = row * ELTS_PER_ROW + col * VecSize; if (std::is_same::value) { - phi::Store(x_output[it], y_ptr + index); + Store(x_output[it], y_ptr + index); } else { - phi::Store(x[it], reinterpret_cast(y_ptr) + index); + Store(x[it], reinterpret_cast(y_ptr) + index); } col += THREADS_PER_ROW; } diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_helper.cu.h b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_helper.cu.h index 7628773e0909bb..0e37ab91373fcb 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_helper.cu.h +++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_helper.cu.h @@ -132,13 +132,13 @@ class GEMMHelper { using NvType = typename phi::PDDataTypeTraits::DataType; if (gemm_method_ == "None") { - auto ffn_linear_compute = phi::fusion::AttnMatMul(dev_ctx_, - false, - transpose_weight_, - token_num_, - dim_ffn_, - dim_embed_, - compute_bias); + auto ffn_linear_compute = fusion::AttnMatMul(dev_ctx_, + false, + transpose_weight_, + token_num_, + dim_ffn_, + dim_embed_, + compute_bias); ffn_linear_compute.ComputeForward(weight, input, bias, output, output); } else { PADDLE_THROW(common::errors::Unimplemented( @@ -174,10 +174,9 @@ class NormHelper { // Layernorm. Need support rmsnorm. layernorm_helper_(dev_ctx_, epsilon_, rows_, cols_) { // VLOG(0) << "NormHelper residual_alpha:" << residual_alpha_; - phi::fusion::DropoutParam dropout_param( - true, 0, true, true, 0.0, nullptr, 0); + fusion::DropoutParam dropout_param(true, 0, true, true, 0.0, nullptr, 0); residual_bias_add_layernorm_helper_ = - phi::fusion::FusedDropoutLayerNormHelper( + fusion::FusedDropoutLayerNormHelper( dev_ctx, rows_, cols_, dropout_param, epsilon_); } @@ -293,7 +292,7 @@ class NormHelper { int64_t cols_; float epsilon_; float residual_alpha_; - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper residual_bias_add_layernorm_helper_; AttnLayerNorm layernorm_helper_; }; diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_int8_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_int8_kernel.cu index ecccffb44a135d..89b7bec19a22f0 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_int8_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_int8_kernel.cu @@ -92,8 +92,8 @@ void FusedMultiTransformerINT8OpKernel( auto ln_scales = ln_scale; auto ln_biases = ln_bias; - auto ln_compute = phi::fusion::AttnLayerNorm( - dev_ctx, epsilon, bsz_seq, dim_embed); + auto ln_compute = + fusion::AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); DenseTensor ln_mean, ln_var; ln_mean.Resize({bsz_seq}); auto *ln_mean_data = @@ -117,7 +117,7 @@ void FusedMultiTransformerINT8OpKernel( bool compute_bias = qkv_biases.size() > 0 && time_step == nullptr; // (transA, transB, compute_bias) = (false, trans_qkvw, false) - phi::fusion::AttnMatmulINT8 qkv_compute( + fusion::AttnMatmulINT8 qkv_compute( dev_ctx, bsz_seq, output_size, input_size, compute_bias); DenseTensor qkv_out; qkv_out.Resize({bsz, seq_len, 3, num_head, dim_head}); @@ -125,10 +125,10 @@ void FusedMultiTransformerINT8OpKernel( dev_ctx.template Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); // 3. fmha - phi::fusion::AttnDropoutParam attn_param( + fusion::AttnDropoutParam attn_param( true, "upscale_in_train", 0.0, true, true, 0, nullptr); - auto fmha_compute = phi::fusion::FMHARef( - dev_ctx, bsz, seq_len, num_head, dim_head, attn_param); + auto fmha_compute = + fusion::FMHARef(dev_ctx, bsz, seq_len, num_head, dim_head, attn_param); auto *src_mask = src_mask_in.get_ptr(); auto cache_kvs = std::vector(); if (cache_kv_in) { @@ -193,16 +193,15 @@ void FusedMultiTransformerINT8OpKernel( auto out_linear_biases = out_linear_bias.get(); // (transA, transB, compute_bias) = (false, false, false) - phi::fusion::AttnMatmulINT8 out_linear_compute( + fusion::AttnMatmulINT8 out_linear_compute( dev_ctx, bsz_seq, dim_embed, hidden_size, false); // 5. ln(residual + bias) - phi::fusion::DropoutParam dropout_param2( - true, 0, true, true, 0.0, nullptr, 0); - phi::fusion::FusedDropoutLayerNormHelper + fusion::DropoutParam dropout_param2(true, 0, true, true, 0.0, nullptr, 0); + fusion::FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper fused_dropout_layernorm_helper_for_post_layernorm( dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); auto ffn_ln_scales = ffn_ln_scale; @@ -225,7 +224,7 @@ void FusedMultiTransformerINT8OpKernel( auto ffn1_weight_dim = ffn1_weights[0]->dims(); int dim_ffn = ffn1_weight_dim[0]; - phi::fusion::AttnMatmulINT8 ffn1_linear_compute( + fusion::AttnMatmulINT8 ffn1_linear_compute( dev_ctx, bsz_seq, dim_ffn, dim_embed, false); DenseTensor ffn1_out; ffn1_out.Resize({bsz_seq, dim_ffn}); @@ -233,11 +232,10 @@ void FusedMultiTransformerINT8OpKernel( dev_ctx.template Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); // 7. ffn act + bias - phi::fusion::DropoutParam ffn1_dropout_param( - true, 0, true, true, 0.0, nullptr, 0); - phi::fusion::FusedDropoutHelper + fusion::DropoutParam ffn1_dropout_param(true, 0, true, true, 0.0, nullptr, 0); + fusion::FusedDropoutHelper fused_act_dropout_helper(dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); - phi::fusion::FusedDropoutHelper + fusion::FusedDropoutHelper fused_act_dropout_helper_for_post_layernorm( dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); DenseTensor ffn1_dropout_out, ffn1_dropout_mask; @@ -251,19 +249,18 @@ void FusedMultiTransformerINT8OpKernel( // 8. ffn2 matmul auto ffn2_weights = ffn2_weight; auto ffn2_biases = ffn2_bias.get(); - phi::fusion::AttnMatmulINT8 ffn2_linear_compute( + fusion::AttnMatmulINT8 ffn2_linear_compute( dev_ctx, bsz_seq, dim_embed, dim_ffn, false); // 9. ffn2 residual bias - phi::fusion::DropoutParam ffn2_dropout_param( - true, 0, true, true, 0.0, nullptr, 0); - phi::fusion::FusedDropoutLayerNormHelper + fusion::DropoutParam ffn2_dropout_param(true, 0, true, true, 0.0, nullptr, 0); + fusion::FusedDropoutLayerNormHelper ffn2_fused_dropout_helper( dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper ffn2_fused_dropout_dequant_helper( dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); - phi::fusion::FusedDropoutLayerNormHelper + fusion::FusedDropoutLayerNormHelper ffn2_fused_dropout_helper_for_post_layernorm( dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu index cad86b5711f038..4de9894fcf6df9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu @@ -232,7 +232,7 @@ void FusedMultiTransformerOpKernel( auto *rotary_tensor = rotary_tensor_in.get_ptr(); // 3. fmha - phi::fusion::AttnDropoutParam attn_param( + fusion::AttnDropoutParam attn_param( true, "upscale_in_train", 0.0, true, true, 0, nullptr); auto *src_mask = src_mask_in.get_ptr(); @@ -416,9 +416,8 @@ void FusedMultiTransformerOpKernel( char *mixgemm_workspace_data = nullptr; // 7. ffn act + bias - phi::fusion::DropoutParam ffn1_dropout_param( - true, 0, true, true, 0.0, nullptr, 0); - phi::fusion::FusedDropoutHelper fused_act_dropout_helper( + fusion::DropoutParam ffn1_dropout_param(true, 0, true, true, 0.0, nullptr, 0); + fusion::FusedDropoutHelper fused_act_dropout_helper( dev_ctx, token_num, dim_ffn, ffn1_dropout_param); DenseTensor ffn1_dropout_out, ffn1_dropout_mask; int tmp_dim_ffn = dim_ffn; @@ -439,11 +438,9 @@ void FusedMultiTransformerOpKernel( dev_ctx, token_num, dim_embed, tmp_dim_ffn, "None", false); // 9. ffn2 residual bias - phi::fusion::DropoutParam ffn2_dropout_param( - true, 0, true, true, 0.0, nullptr, 0); - phi::fusion::FusedDropoutLayerNormHelper - ffn2_fused_dropout_helper( - dev_ctx, token_num, dim_embed, ffn2_dropout_param, epsilon); + fusion::DropoutParam ffn2_dropout_param(true, 0, true, true, 0.0, nullptr, 0); + fusion::FusedDropoutLayerNormHelper ffn2_fused_dropout_helper( + dev_ctx, token_num, dim_embed, ffn2_dropout_param, epsilon); DenseTensor tmp_out, tmp_out_rm_padding; tmp_out.Resize({token_num, dim_embed}); diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h index 37bba718254417..d8a021aff7cdda 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h +++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h @@ -1959,7 +1959,7 @@ __global__ void fusedQKV_transpose_split_kernel(T *q_buf, step = gridDim.x * blockDim.x * VecSize; linear_index < elem_cnt; linear_index += step) { - phi::Load(&qkv[linear_index], &src_vec); + Load(&qkv[linear_index], &src_vec); int32_t bias_idx = linear_index % fused_hidden_size; const int32_t token_idx = linear_index / fused_hidden_size; const int32_t ori_token_idx = @@ -1974,11 +1974,11 @@ __global__ void fusedQKV_transpose_split_kernel(T *q_buf, const int32_t write_idx = token_idx * hidden_size + head_id * size_per_head + size_id; if (qkv_id == 0) { - phi::Store(src_vec, &q_buf[write_idx]); + Store(src_vec, &q_buf[write_idx]); } else if (qkv_id == 1) { - phi::Store(src_vec, &k_buf[write_idx]); + Store(src_vec, &k_buf[write_idx]); } else { - phi::Store(src_vec, &v_buf[write_idx]); + Store(src_vec, &v_buf[write_idx]); } } } @@ -2050,10 +2050,10 @@ __global__ void add_fusedQKV_bias_transpose_split_kernel( step = gridDim.x * blockDim.x * VecSize; linear_index < elem_cnt; linear_index += step) { - phi::Load(&qkv[linear_index], &src_vec); + Load(&qkv[linear_index], &src_vec); int32_t bias_idx = linear_index % fused_hidden_size; if (ComputeBias) { - phi::Load(&qkv_bias[bias_idx], &bias_vec); + Load(&qkv_bias[bias_idx], &bias_vec); #pragma unroll for (int32_t unroll_idx = 0; unroll_idx < VecSize; unroll_idx++) { src_vec[unroll_idx] += bias_vec[unroll_idx]; @@ -2072,14 +2072,14 @@ __global__ void add_fusedQKV_bias_transpose_split_kernel( const int32_t size_id = linear_index % size_per_head; if (qkv_id == 0) { - phi::Store( + Store( src_vec, &q_buf[target_batch_id * head_num * seq_len * size_per_head + head_id * seq_len * size_per_head + seq_id * size_per_head + size_id]); } else { const int32_t kv_store_offset = (qkv_id - 1) * offset; - phi::Store( + Store( src_vec, &kv_buf[kv_store_offset + target_batch_id * head_num * seq_len * size_per_head + @@ -2186,7 +2186,7 @@ __global__ void gqa_fusedQKV_transpose_split_kernel(T *q_buf, step = gridDim.x * blockDim.x * VecSize; linear_index < elem_cnt; linear_index += step) { - phi::Load(&qkv[linear_index], &src_vec); + Load(&qkv[linear_index], &src_vec); int32_t bias_idx = linear_index % fused_hidden_size; const int32_t token_idx = linear_index / fused_hidden_size; const int32_t ori_token_idx = @@ -2201,18 +2201,18 @@ __global__ void gqa_fusedQKV_transpose_split_kernel(T *q_buf, if (head_id < head_num) { const int32_t write_idx = token_idx * head_num * size_per_head + head_id * size_per_head + size_id; - phi::Store(src_vec, &q_buf[write_idx]); + Store(src_vec, &q_buf[write_idx]); } else { if (head_id < head_num + gqa_group_size) { const int32_t write_idx = token_idx * gqa_group_size * size_per_head + (head_id - head_num) * size_per_head + size_id; - phi::Store(src_vec, &k_buf[write_idx]); + Store(src_vec, &k_buf[write_idx]); } else { const int32_t write_idx = token_idx * gqa_group_size * size_per_head + (head_id - head_num - gqa_group_size) * size_per_head + size_id; - phi::Store(src_vec, &v_buf[write_idx]); + Store(src_vec, &v_buf[write_idx]); } } } @@ -2590,16 +2590,16 @@ __global__ void ActFFNGlu(const T *bias, int idx = i % hid_dim; // const T *input_this_thread = input + bi * hid_dim * 2; // T *output_this_thread = output + bi * hid_dim; - // phi::Load(&input_this_thread[idx], &src_vec1); - // phi::Load(&input_this_thread[idx + hid_dim], &src_vec2); + // Load(&input_this_thread[idx], &src_vec1); + // Load(&input_this_thread[idx + hid_dim], &src_vec2); load_func.template load(&src_vec1, bi * hid_dim * 2 + idx); load_func.template load(&src_vec2, bi * hid_dim * 2 + idx + hid_dim); if (bias) { - phi::Load(&bias[idx], &bias_vec1); - phi::Load(&bias[idx + hid_dim], &bias_vec2); + Load(&bias[idx], &bias_vec1); + Load(&bias[idx + hid_dim], &bias_vec2); } #pragma unroll for (int j = 0; j < VecSize; j++) { @@ -2610,7 +2610,7 @@ __global__ void ActFFNGlu(const T *bias, src_vec1[j] = act_functor(src_vec1[j]); src_vec1[j] *= src_vec2[j]; } - // phi::Store(src_vec1, &output_this_thread[idx]); + // Store(src_vec1, &output_this_thread[idx]); store_func.template store(src_vec1, bi * hid_dim + idx); } } @@ -2680,10 +2680,10 @@ __global__ void BiasAct(const T *bias, int row_idx = i / cols; int col_idx = i % cols; int linear_idx = row_idx * cols + col_idx; - // phi::Load(&input[linear_idx], &src_vec); + // Load(&input[linear_idx], &src_vec); load_func.template load(&src_vec, linear_idx); if (bias) { - phi::Load(&bias[col_idx], &bias_vec); + Load(&bias[col_idx], &bias_vec); } #pragma unroll for (int j = 0; j < VecSize; j++) { @@ -2692,7 +2692,7 @@ __global__ void BiasAct(const T *bias, } src_vec[j] = act_functor(src_vec[j]); } - // phi::Store(src_vec, &output[linear_idx]); + // Store(src_vec, &output[linear_idx]); store_func.template store(src_vec, linear_idx); } } @@ -2782,30 +2782,28 @@ __global__ void fused_transpose_split_kernel( const int32_t size_id = linear_index % size_per_head; if (qkv_id == 0) { // read q - phi::Load( - &q_input[target_batch_id * head_num * max_len_this_time * - size_per_head + - head_id * max_len_this_time * size_per_head + - seq_id * size_per_head + size_id], - &src_vec); + Load(&q_input[target_batch_id * head_num * max_len_this_time * + size_per_head + + head_id * max_len_this_time * size_per_head + + seq_id * size_per_head + size_id], + &src_vec); } else { // read k/v const int32_t kv_store_offset = (qkv_id - 1) * offset; - phi::Load( - &kv_input[kv_store_offset + - target_batch_id * head_num * max_len_this_time * - size_per_head + - head_id * max_len_this_time * size_per_head + - seq_id * size_per_head + size_id], - &src_vec); + Load(&kv_input[kv_store_offset + + target_batch_id * head_num * + max_len_this_time * size_per_head + + head_id * max_len_this_time * size_per_head + + seq_id * size_per_head + size_id], + &src_vec); } int32_t write_index = linear_index - (qkv_id + 2 * current_token) * hidden_size; if (qkv_id == 0) { - phi::Store(src_vec, &q_out[write_index]); + Store(src_vec, &q_out[write_index]); } else if (qkv_id == 1) { - phi::Store(src_vec, &k_out[write_index]); + Store(src_vec, &k_out[write_index]); } else if (qkv_id == 2) { - phi::Store(src_vec, &v_out[write_index]); + Store(src_vec, &v_out[write_index]); } } } @@ -2930,10 +2928,10 @@ __global__ void VariableLengthRotaryKernel( ori_bi * seq_len * last_dim + ori_seq_id * last_dim + h_bias; const int64_t bias_idx = qkv_id * hidden_size + hi * last_dim + h_bias; const int64_t base_idx = token_idx * 3 * hidden_size + bias_idx; - phi::Load(&qkv[base_idx], &src_vec); - phi::Load(&qkv_biases[bias_idx], &bias_vec); - phi::Load(&cos_emb[emb_idx], &cos_emb_vec); - phi::Load(&sin_emb[emb_idx], &sin_emb_vec); + Load(&qkv[base_idx], &src_vec); + Load(&qkv_biases[bias_idx], &bias_vec); + Load(&cos_emb[emb_idx], &cos_emb_vec); + Load(&sin_emb[emb_idx], &sin_emb_vec); #pragma unroll for (int i = 0; i < HalfVecSize; i++) { const float input_left = @@ -2958,7 +2956,7 @@ __global__ void VariableLengthRotaryKernel( src_vec[2 * i + 1] = static_cast(input_right); } } - phi::Store(src_vec, &qkv_out[base_idx]); + Store(src_vec, &qkv_out[base_idx]); } } @@ -3045,10 +3043,10 @@ __global__ void GQAVariableLengthRotaryKernel( ori_bi * seq_len * last_dim + ori_seq_id * last_dim + h_bias; const int64_t bias_idx = hi * last_dim + h_bias; const int64_t base_idx = token_idx * offset + bias_idx; - phi::Load(&qkv[base_idx], &src_vec); - phi::Load(&qkv_biases[bias_idx], &bias_vec); - phi::Load(&cos_emb[emb_idx], &cos_emb_vec); - phi::Load(&sin_emb[emb_idx], &sin_emb_vec); + Load(&qkv[base_idx], &src_vec); + Load(&qkv_biases[bias_idx], &bias_vec); + Load(&cos_emb[emb_idx], &cos_emb_vec); + Load(&sin_emb[emb_idx], &sin_emb_vec); #pragma unroll for (int i = 0; i < HalfVecSize; i++) { const float input_left = @@ -3073,7 +3071,7 @@ __global__ void GQAVariableLengthRotaryKernel( src_vec[2 * i + 1] = static_cast(input_right); } } - phi::Store(src_vec, &qkv_out[base_idx]); + Store(src_vec, &qkv_out[base_idx]); } } diff --git a/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h index e4d9da8723e0af..441bd76f9658bd 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h @@ -79,15 +79,14 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( residual_vec[ii] = static_cast(0); } // vectorize load data from global - phi::Load(&src[row_id * cols + col_id], &src_vec); - phi::Load(&dequant_out_scale_data[col_id], - &quant_out_scale_vec); + Load(&src[row_id * cols + col_id], &src_vec); + Load(&dequant_out_scale_data[col_id], &quant_out_scale_vec); if (residual) { - phi::Load(&residual[row_id * cols + col_id], &residual_vec); + Load(&residual[row_id * cols + col_id], &residual_vec); } if (bias) { - phi::Load(&bias[col_id], &bias_vec); + Load(&bias[col_id], &bias_vec); } MaskStoreT mask_vec; @@ -144,14 +143,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( // store result to global if (std::is_same::value) { - phi::Store(dest_vec_out_type, - &dst[row_id * cols + col_id]); + Store(dest_vec_out_type, &dst[row_id * cols + col_id]); } else { - phi::Store(dest_vec, - reinterpret_cast(&dst[row_id * cols + col_id])); + Store(dest_vec, + reinterpret_cast(&dst[row_id * cols + col_id])); } if (!is_test && HasDropout) { - phi::Store(mask_vec, &mask[row_id * cols + col_id]); + Store(mask_vec, &mask[row_id * cols + col_id]); } } @@ -191,9 +189,9 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, LoadT out_vec; MaskLoadT mask_vec; StoreT dx_vec; - phi::Load(&dout[index], &out_vec); + Load(&dout[index], &out_vec); if (HasDropout) { - phi::Load(&mask[index], &mask_vec); + Load(&mask[index], &mask_vec); } if (not_need_dx) { @@ -211,7 +209,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, } tmp_sum[i] += out_vec[i]; } - phi::Store(dx_vec, &dx[index]); + Store(dx_vec, &dx[index]); } } } @@ -238,15 +236,15 @@ __global__ void FusedResidualDropoutGrad(const T *dout, for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; MaskLoadT mask_vec; - phi::Load(&dout[i], &dout_vec); - phi::Load(&mask[i], &mask_vec); + Load(&dout[i], &dout_vec); + Load(&mask[i], &mask_vec); StoreT dx_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; } - phi::Store(dx_vec, &dx[i]); + Store(dx_vec, &dx[i]); } } diff --git a/paddle/phi/kernels/fusion/gpu/mmha_util.cu.h b/paddle/phi/kernels/fusion/gpu/mmha_util.cu.h index e8bb92e960d52c..336fba5f320335 100644 --- a/paddle/phi/kernels/fusion/gpu/mmha_util.cu.h +++ b/paddle/phi/kernels/fusion/gpu/mmha_util.cu.h @@ -3739,7 +3739,7 @@ struct MMHAStore { for (int i = 0; i < VecSize; i++) { src_vec[i] = static_cast(static_cast(src_vec[i]) * scale); } - phi::Store(src_vec, dst_ + idx); + Store(src_vec, dst_ + idx); } StoreT* dst_; @@ -3759,15 +3759,15 @@ struct MMHAStore { TVec smooth_vec; *reinterpret_cast(&src_vec) = src; - phi::Load(shift_ + idx % cols_, &shift_vec); - phi::Load(smooth_ + idx % cols_, &smooth_vec); + Load(shift_ + idx % cols_, &shift_vec); + Load(smooth_ + idx % cols_, &smooth_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { src_vec[i] = (src_vec[i] + shift_vec[i]) * smooth_vec[i]; } - phi::Store(src_vec, dst_ + idx); + Store(src_vec, dst_ + idx); } T* dst_; @@ -3792,8 +3792,8 @@ struct MMHALoad { DstVec dst_vec; ScaleVec scale_vec; - phi::Load(src_ + idx, &src_vec); - phi::Load(dequant_scales_ + idx % cols_, &scale_vec); + Load(src_ + idx, &src_vec); + Load(dequant_scales_ + idx % cols_, &scale_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { dst_vec[i] = @@ -3840,7 +3840,7 @@ struct MMHAStore { quant_min_bound_); } - phi::Store(dst_vec, dst_ + idx); + Store(dst_vec, dst_ + idx); } int8_t* dst_; @@ -3881,8 +3881,8 @@ struct MMHAStore { SrcVec smooth_vec; *reinterpret_cast(&src_vec) = src; - phi::Load(shift_ + idx % cols_, &shift_vec); - phi::Load(smooth_ + idx % cols_, &smooth_vec); + Load(shift_ + idx % cols_, &shift_vec); + Load(smooth_ + idx % cols_, &smooth_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { @@ -3895,7 +3895,7 @@ struct MMHAStore { quant_min_bound_); } - phi::Store(dst_vec, dst_ + idx); + Store(dst_vec, dst_ + idx); } int8_t* dst_; diff --git a/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h b/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h index 1e313e4f6f21ef..e4f67973105ebd 100644 --- a/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h +++ b/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h @@ -127,8 +127,8 @@ __global__ void DequantKernel(T* output, AlignedVector out_vec; for (; idx < numel; idx += stride) { - phi::Load(input + idx, &in_vec); - phi::Load(dequant_out_scale_data + col_id, &out_scale_vec); + Load(input + idx, &in_vec); + Load(dequant_out_scale_data + col_id, &out_scale_vec); #pragma unroll for (int i = 0; i < VecSize; ++i) { @@ -136,7 +136,7 @@ __global__ void DequantKernel(T* output, static_cast(static_cast(in_vec[i]) * out_scale_vec[i]); } - phi::Store(out_vec, output + idx); + Store(out_vec, output + idx); } }