"vscode:/vscode.git/clone" did not exist on "07b297e7dec3f475a880c15addcb6c02c0690992"
Commit 92b9b046 authored by ltqin's avatar ltqin
Browse files

add from botton right mask

parent 41c659bb
...@@ -24,7 +24,7 @@ Kernel outputs: ...@@ -24,7 +24,7 @@ Kernel outputs:
*/ */
#define PRINT_HOST 0 #define PRINT_HOST 0
#define USING_MASK 0 #define USING_MASK 1
#define DIM 128 // DIM should be a multiple of 8. #define DIM 128 // DIM should be a multiple of 8.
#include <iostream> #include <iostream>
...@@ -85,7 +85,7 @@ static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 8; ...@@ -85,7 +85,7 @@ static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 8;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
#if USING_MASK #if USING_MASK
static constexpr auto MaskingSpec = static constexpr auto MaskingSpec =
ck::tensor_operation::device::MaskingSpecialization::MaskOutUpperTriangle; ck::tensor_operation::device::MaskingSpecialization::MaskUpperTringleFromBottonRight;
#else #else
static constexpr auto MaskingSpec = static constexpr auto MaskingSpec =
ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
...@@ -227,8 +227,9 @@ void run_attention_fwd_host(const TensorQ& q_g_m_k, ...@@ -227,8 +227,9 @@ void run_attention_fwd_host(const TensorQ& q_g_m_k,
ref_gemm0_invoker.Run(ref_gemm0_argument); ref_gemm0_invoker.Run(ref_gemm0_argument);
// masking // masking
auto M = s_g_m_n.GetLengths()[1];
auto N = s_g_m_n.GetLengths()[2]; auto N = s_g_m_n.GetLengths()[2];
const auto mask = DeviceGemmInstance::C0MatrixMask(N); const auto mask = DeviceGemmInstance::C0MatrixMask(M, N);
s_g_m_n.ForEach([&](auto& self, auto idx) { s_g_m_n.ForEach([&](auto& self, auto idx) {
if(mask.IsMaskedElement(idx[1], idx[2])) if(mask.IsMaskedElement(idx[1], idx[2]))
self(idx) = -ck::NumericLimits<float>::Infinity(); self(idx) = -ck::NumericLimits<float>::Infinity();
...@@ -267,7 +268,7 @@ int run(int argc, char* argv[]) ...@@ -267,7 +268,7 @@ int run(int argc, char* argv[])
// y_g_m_o = Softmax(alpha * Q_g_m_k * K_g_k_n) * V_g_n_o // y_g_m_o = Softmax(alpha * Q_g_m_k * K_g_k_n) * V_g_n_o
// y_g0_g1_m_o = reshape(y_g_m_o, [G0, G1, M, O]) // y_g0_g1_m_o = reshape(y_g_m_o, [G0, G1, M, O])
// y_g0_m_g1_o = permute(y_g0_g1_m_o, [0, 2, 1, 3]) // y_g0_m_g1_o = permute(y_g0_g1_m_o, [0, 2, 1, 3])
ck::index_t M = 512; ck::index_t M = 123;
ck::index_t N = 512; ck::index_t N = 512;
ck::index_t K = DIM; ck::index_t K = DIM;
ck::index_t O = DIM; ck::index_t O = DIM;
......
...@@ -156,36 +156,37 @@ __global__ void ...@@ -156,36 +156,37 @@ __global__ void
} }
else else
{ {
GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset, GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
p_b_grid + b_batch_offset, p_a_grid + a_batch_offset,
z_matrix_ptr, p_b_grid + b_batch_offset,
p_b1_grid + b1_batch_offset, z_matrix_ptr,
p_c_grid + c_batch_offset, p_b1_grid + b1_batch_offset,
p_lse_grid + lse_batch_offset, p_c_grid + c_batch_offset,
p_ygrad_grid + c_batch_offset, p_lse_grid + lse_batch_offset,
p_qgrad_grid + a_batch_offset, p_ygrad_grid + c_batch_offset,
p_kgrad_grid + b_batch_offset, p_qgrad_grid + a_batch_offset,
p_vgrad_grid + b1_batch_offset, p_kgrad_grid + b_batch_offset,
p_shared, p_vgrad_grid + b1_batch_offset,
a_element_op, p_shared,
b_element_op, a_element_op,
acc_element_op, b_element_op,
b1_element_op, acc_element_op,
c_element_op, b1_element_op,
a_grid_desc_ak0_m_ak1, c_element_op,
b_grid_desc_bk0_n_bk1, a_grid_desc_ak0_m_ak1,
c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5, b_grid_desc_bk0_n_bk1,
b1_grid_desc_bk0_n_bk1, c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
c_grid_desc_mblock_mperblock_nblock_nperblock, b1_grid_desc_bk0_n_bk1,
lse_grid_desc_m, c_grid_desc_mblock_mperblock_nblock_nperblock,
ygrad_grid_desc_o0_m_o1, lse_grid_desc_m,
block_2_ctile_map, ygrad_grid_desc_o0_m_o1,
c0_matrix_mask, block_2_ctile_map,
p_drop, c0_matrix_mask,
ph, p_drop,
z_random_matrix_offset, ph,
raw_n_padded, z_random_matrix_offset,
0); raw_n_padded,
0);
} }
#else #else
ignore = p_a_grid; ignore = p_a_grid;
...@@ -1000,10 +1001,15 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1 ...@@ -1000,10 +1001,15 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
arg.m_raw_padded_, arg.m_raw_padded_,
arg.n_raw_padded_); arg.n_raw_padded_);
}; };
if(arg.p_drop_ > 0.0){ if(arg.p_drop_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{}); {
}else{ ave_time = launch_kernel(integral_constant<bool, false>{},
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{}); integral_constant<bool, true>{});
}
else
{
ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, false>{});
} }
return ave_time; return ave_time;
} }
......
...@@ -155,36 +155,37 @@ __global__ void ...@@ -155,36 +155,37 @@ __global__ void
} }
else else
{ {
GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset, GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
p_b_grid + b_batch_offset, p_a_grid + a_batch_offset,
z_matrix_ptr, p_b_grid + b_batch_offset,
p_b1_grid + b1_batch_offset, z_matrix_ptr,
p_c_grid + c_batch_offset, p_b1_grid + b1_batch_offset,
p_lse_grid + lse_batch_offset, p_c_grid + c_batch_offset,
p_ygrad_grid + c_batch_offset, p_lse_grid + lse_batch_offset,
p_qgrad_grid + a_batch_offset, p_ygrad_grid + c_batch_offset,
p_kgrad_grid + b_batch_offset, p_qgrad_grid + a_batch_offset,
p_vgrad_grid + b1_batch_offset, p_kgrad_grid + b_batch_offset,
p_shared, p_vgrad_grid + b1_batch_offset,
a_element_op, p_shared,
b_element_op, a_element_op,
acc_element_op, b_element_op,
b1_element_op, acc_element_op,
c_element_op, b1_element_op,
a_grid_desc_ak0_m_ak1, c_element_op,
b_grid_desc_bk0_n_bk1, a_grid_desc_ak0_m_ak1,
c_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3, b_grid_desc_bk0_n_bk1,
b1_grid_desc_bk0_n_bk1, c_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
c_grid_desc_mblock_mperblock_nblock_nperblock, b1_grid_desc_bk0_n_bk1,
lse_grid_desc_m, c_grid_desc_mblock_mperblock_nblock_nperblock,
ygrad_grid_desc_m0_o_m1, lse_grid_desc_m,
block_2_ctile_map, ygrad_grid_desc_m0_o_m1,
c0_matrix_mask, block_2_ctile_map,
p_drop, c0_matrix_mask,
ph, p_drop,
z_random_matrix_offset, ph,
raw_n_padded, z_random_matrix_offset,
0); raw_n_padded,
0);
} }
#else #else
ignore = p_a_grid; ignore = p_a_grid;
...@@ -574,6 +575,10 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -574,6 +575,10 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
{ {
return MaskOutUpperTrianglePredicate{}; return MaskOutUpperTrianglePredicate{};
} }
else if constexpr(MaskingSpec == MaskingSpecialization::MaskUpperTringleFromBottonRight)
{
return MaskUpperTringleFromBottonRightPredicate{};
}
} }
using C0MatrixMask = C0MatrixMask_impl<decltype(make_MaskOutPredicate())>; using C0MatrixMask = C0MatrixMask_impl<decltype(make_MaskOutPredicate())>;
...@@ -786,7 +791,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -786,7 +791,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
acc_element_op_{acc_element_op}, acc_element_op_{acc_element_op},
b1_element_op_{b1_element_op}, b1_element_op_{b1_element_op},
c_element_op_{c_element_op}, c_element_op_{c_element_op},
c0_matrix_mask_{b_grid_desc_g_n_k_.GetLength(I1)}, c0_matrix_mask_{a_grid_desc_g_m_k_.GetLength(I1), b_grid_desc_g_n_k_.GetLength(I1)},
raw_lengths_mz_nz_kz_gemm1nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1], raw_lengths_mz_nz_kz_gemm1nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1],
b_gs_ns_ks_lengths[NumDimG + NumDimN - 1], b_gs_ns_ks_lengths[NumDimG + NumDimN - 1],
b_gs_ns_ks_lengths[NumDimG + NumDimN + NumDimK - 1], b_gs_ns_ks_lengths[NumDimG + NumDimN + NumDimK - 1],
...@@ -1024,16 +1029,20 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -1024,16 +1029,20 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{ {
if(arg.p_drop_ > 0.0) if(arg.p_drop_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, false>{});
} }
else else
{ {
if(arg.p_drop_ > 0.0) if(arg.p_drop_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, false>{});
} }
return ave_time; return ave_time;
......
...@@ -999,16 +999,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1 ...@@ -999,16 +999,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
if(all_has_main_k_block_loop) if(all_has_main_k_block_loop)
{ {
if(arg.p_dropout_ > 0.0) if(arg.p_dropout_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, false>{});
} }
else if(!some_has_main_k_block_loop) else if(!some_has_main_k_block_loop)
{ {
if(arg.p_dropout_ > 0.0) if(arg.p_dropout_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, false>{});
} }
else else
{ {
......
...@@ -1006,16 +1006,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -1006,16 +1006,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
if(all_has_main_k_block_loop) if(all_has_main_k_block_loop)
{ {
if(arg.p_dropout_ > 0.0) if(arg.p_dropout_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, false>{});
} }
else if(!some_has_main_k_block_loop) else if(!some_has_main_k_block_loop)
{ {
if(arg.p_dropout_ > 0.0) if(arg.p_dropout_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, false>{});
} }
else else
{ {
......
...@@ -10,7 +10,8 @@ namespace device { ...@@ -10,7 +10,8 @@ namespace device {
enum struct MaskingSpecialization enum struct MaskingSpecialization
{ {
MaskDisabled, MaskDisabled,
MaskOutUpperTriangle MaskOutUpperTriangle,
MaskUpperTringleFromBottonRight
}; };
inline std::string getMaskingSpecializationString(const MaskingSpecialization& s) inline std::string getMaskingSpecializationString(const MaskingSpecialization& s)
...@@ -19,6 +20,8 @@ inline std::string getMaskingSpecializationString(const MaskingSpecialization& s ...@@ -19,6 +20,8 @@ inline std::string getMaskingSpecializationString(const MaskingSpecialization& s
{ {
case MaskingSpecialization::MaskDisabled: return "MaskDisabled"; case MaskingSpecialization::MaskDisabled: return "MaskDisabled";
case MaskingSpecialization::MaskOutUpperTriangle: return "MaskOutUpperTriangle"; case MaskingSpecialization::MaskOutUpperTriangle: return "MaskOutUpperTriangle";
case MaskingSpecialization::MaskUpperTringleFromBottonRight:
return "MaskUpperTringleFromBottonRight";
default: return "Unrecognized specialization!"; default: return "Unrecognized specialization!";
} }
} }
...@@ -47,13 +50,37 @@ struct MaskOutUpperTrianglePredicate ...@@ -47,13 +50,37 @@ struct MaskOutUpperTrianglePredicate
return operator()(m + m_tile - 1, n); return operator()(m + m_tile - 1, n);
} }
}; };
struct MaskUpperTringleFromBottonRightPredicate
{
__host__ __device__ void SetOffset(const index_t offset) { offset_ = offset; }
__host__ __device__ constexpr bool operator()(index_t m, index_t n) const
{
return n > m + offset_;
}
__host__ __device__ constexpr bool
IsTileSkippable(index_t m, index_t n, index_t m_tile, index_t /*n_tile*/) const
{
return operator()(m + m_tile - 1, n);
}
private:
index_t offset_;
};
// to track the points which need to be set to -inf on C0 // to track the points which need to be set to -inf on C0
// Note: no need to reset M padding value, because they will not be stored out. // Note: no need to reset M padding value, because they will not be stored out.
template <typename MaskOutPredicate> template <typename MaskOutPredicate>
struct C0MatrixMask_impl struct C0MatrixMask_impl
{ {
C0MatrixMask_impl(index_t NRaw) : NRaw_(NRaw), predicate_(MaskOutPredicate{}) {} C0MatrixMask_impl(index_t MRaw, index_t NRaw) : NRaw_(NRaw), predicate_(MaskOutPredicate{})
{
if constexpr(std::is_same<MaskOutPredicate,
MaskUpperTringleFromBottonRightPredicate>::value)
{
predicate_.SetOffset(NRaw - MRaw);
}
}
__host__ __device__ constexpr bool IsNOutOfBound(/*index_t m, */ index_t n) const __host__ __device__ constexpr bool IsNOutOfBound(/*index_t m, */ index_t n) const
{ {
......
...@@ -1935,8 +1935,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1 ...@@ -1935,8 +1935,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
auto n_local = auto n_local =
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
auto m_global = m_local + m_block_data_idx_on_grid; auto m_global = m_local + m_block_data_idx_on_grid;
auto n_global = n_local + n_block_data_idx_on_grid; auto n_global = n_local + n_block_data_idx_on_grid;
if(c0_matrix_mask.IsMaskedElement(m_global, n_global)) if(c0_matrix_mask.IsMaskedElement(m_global, n_global))
{ {
s_slash_p_thread_buf(i) = -ck::NumericLimits<float>::Infinity(); s_slash_p_thread_buf(i) = -ck::NumericLimits<float>::Infinity();
......
...@@ -1948,54 +1948,61 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1 ...@@ -1948,54 +1948,61 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
constexpr auto position_offset = M3 * M4; constexpr auto position_offset = M3 * M4;
// save z to global // save z to global
if constexpr(IsDropout){ if constexpr(IsDropout)
{
if(p_z_grid) if(p_z_grid)
{ {
auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin; auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
auto m_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; auto m_local =
auto n_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
auto n_local =
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
auto m_global = m_local + m_block_data_idx_on_grid; auto m_global = m_local + m_block_data_idx_on_grid;
auto n_global = n_local + n_block_data_idx_on_grid; auto n_global = n_local + n_block_data_idx_on_grid;
auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded + auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
n_global; // unique element global 1d id n_global; // unique element global 1d id
auto global_elem_id = auto global_elem_id =
(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4; (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf), blockwise_dropout
decltype(z_tenor_buffer), .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
decltype(position_offset), decltype(z_tenor_buffer),
true>( decltype(position_offset),
s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded); true>(
s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), z_thread_copy_vgpr_to_global.Run(
z_tenor_buffer, z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3, make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
z_grid_buf); z_tenor_buffer,
z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
z_grid_buf);
} }
else else
{ {
ignore = z_grid_buf; ignore = z_grid_buf;
auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin; auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
auto m_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; auto m_local =
auto n_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
auto n_local =
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
auto m_global = m_local + m_block_data_idx_on_grid; auto m_global = m_local + m_block_data_idx_on_grid;
auto n_global = n_local + n_block_data_idx_on_grid; auto n_global = n_local + n_block_data_idx_on_grid;
auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded + auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
n_global; // unique element global 1d id n_global; // unique element global 1d id
auto global_elem_id = auto global_elem_id =
(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4; (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
// P_dropped // P_dropped
blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf), blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
decltype(position_offset), decltype(position_offset),
true>( true>(
s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded); s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
} }
} }
......
...@@ -1864,53 +1864,60 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -1864,53 +1864,60 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
constexpr auto position_offset = M3 * M4; constexpr auto position_offset = M3 * M4;
// save z to global // save z to global
if constexpr(IsDropout){ if constexpr(IsDropout)
{
if(p_z_grid) if(p_z_grid)
{ {
auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin; auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
auto m_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; auto m_local =
auto n_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
auto n_local =
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
auto m_global = m_local + m_block_data_idx_on_grid; auto m_global = m_local + m_block_data_idx_on_grid;
auto n_global = n_local + n_block_data_idx_on_grid; auto n_global = n_local + n_block_data_idx_on_grid;
auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded + auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
n_global; // unique element global 1d id n_global; // unique element global 1d id
auto global_elem_id = auto global_elem_id =
(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4; (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf), blockwise_dropout
decltype(z_tenor_buffer), .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
decltype(position_offset), decltype(z_tenor_buffer),
true>( decltype(position_offset),
s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded); true>(
s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), z_thread_copy_vgpr_to_global.Run(
z_tenor_buffer, z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3, make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
z_grid_buf); z_tenor_buffer,
z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
z_grid_buf);
} }
else else
{ {
ignore = z_grid_buf; ignore = z_grid_buf;
auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin; auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
auto m_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; auto m_local =
auto n_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
auto n_local =
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
auto m_global = m_local + m_block_data_idx_on_grid; auto m_global = m_local + m_block_data_idx_on_grid;
auto n_global = n_local + n_block_data_idx_on_grid; auto n_global = n_local + n_block_data_idx_on_grid;
auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded + auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
n_global; // unique element global 1d id n_global; // unique element global 1d id
auto global_elem_id = auto global_elem_id =
(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4; (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
// P_dropped // P_dropped
blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf), blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
decltype(position_offset), decltype(position_offset),
true>( true>(
s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded); s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
} }
} }
......
...@@ -917,7 +917,7 @@ struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle_V1 ...@@ -917,7 +917,7 @@ struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle_V1
{ {
block_sync_lds(); block_sync_lds();
} }
do do
{ {
auto n_block_data_idx_on_grid = auto n_block_data_idx_on_grid =
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment