"include/vscode:/vscode.git/clone" did not exist on "cc9d2a84bf3e49482416e0fcf1f8a5cc5d11730c"
Commit 52478ac3 authored by ltqin's avatar ltqin
Browse files

change name to d_thread_desc_mblock_m1

parent 2aa9cbee
...@@ -156,7 +156,8 @@ __global__ void ...@@ -156,7 +156,8 @@ __global__ void
} }
else else
{ {
GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset, GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset, p_b_grid + b_batch_offset,
z_matrix_ptr, z_matrix_ptr,
p_b1_grid + b1_batch_offset, p_b1_grid + b1_batch_offset,
...@@ -1000,10 +1001,15 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1 ...@@ -1000,10 +1001,15 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
arg.m_raw_padded_, arg.m_raw_padded_,
arg.n_raw_padded_); arg.n_raw_padded_);
}; };
if(arg.p_drop_ > 0.0){ if(arg.p_drop_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{}); {
}else{ ave_time = launch_kernel(integral_constant<bool, false>{},
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{}); integral_constant<bool, true>{});
}
else
{
ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, false>{});
} }
return ave_time; return ave_time;
} }
......
...@@ -155,7 +155,8 @@ __global__ void ...@@ -155,7 +155,8 @@ __global__ void
} }
else else
{ {
GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset, GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset, p_b_grid + b_batch_offset,
z_matrix_ptr, z_matrix_ptr,
p_b1_grid + b1_batch_offset, p_b1_grid + b1_batch_offset,
...@@ -1024,16 +1025,20 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -1024,16 +1025,20 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{ {
if(arg.p_drop_ > 0.0) if(arg.p_drop_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, false>{});
} }
else else
{ {
if(arg.p_drop_ > 0.0) if(arg.p_drop_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, false>{});
} }
return ave_time; return ave_time;
......
...@@ -999,16 +999,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1 ...@@ -999,16 +999,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
if(all_has_main_k_block_loop) if(all_has_main_k_block_loop)
{ {
if(arg.p_dropout_ > 0.0) if(arg.p_dropout_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, false>{});
} }
else if(!some_has_main_k_block_loop) else if(!some_has_main_k_block_loop)
{ {
if(arg.p_dropout_ > 0.0) if(arg.p_dropout_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, false>{});
} }
else else
{ {
......
...@@ -1006,16 +1006,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -1006,16 +1006,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
if(all_has_main_k_block_loop) if(all_has_main_k_block_loop)
{ {
if(arg.p_dropout_ > 0.0) if(arg.p_dropout_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, true>{},
integral_constant<bool, false>{});
} }
else if(!some_has_main_k_block_loop) else if(!some_has_main_k_block_loop)
{ {
if(arg.p_dropout_ > 0.0) if(arg.p_dropout_ > 0.0)
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, true>{});
else else
ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{}); ave_time = launch_kernel(integral_constant<bool, false>{},
integral_constant<bool, false>{});
} }
else else
{ {
......
...@@ -1948,13 +1948,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1 ...@@ -1948,13 +1948,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
constexpr auto position_offset = M3 * M4; constexpr auto position_offset = M3 * M4;
// save z to global // save z to global
if constexpr(IsDropout){ if constexpr(IsDropout)
{
if(p_z_grid) if(p_z_grid)
{ {
auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin; auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
auto m_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; auto m_local =
auto n_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
auto n_local =
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
auto m_global = m_local + m_block_data_idx_on_grid; auto m_global = m_local + m_block_data_idx_on_grid;
auto n_global = n_local + n_block_data_idx_on_grid; auto n_global = n_local + n_block_data_idx_on_grid;
...@@ -1964,13 +1967,15 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1 ...@@ -1964,13 +1967,15 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
auto global_elem_id = auto global_elem_id =
(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4; (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf), blockwise_dropout
.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
decltype(z_tenor_buffer), decltype(z_tenor_buffer),
decltype(position_offset), decltype(position_offset),
true>( true>(
s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded); s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3, z_thread_copy_vgpr_to_global.Run(
z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
z_tenor_buffer, z_tenor_buffer,
z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3, z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
...@@ -1981,8 +1986,10 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1 ...@@ -1981,8 +1986,10 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
ignore = z_grid_buf; ignore = z_grid_buf;
auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin; auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
auto m_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; auto m_local =
auto n_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
auto n_local =
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
auto m_global = m_local + m_block_data_idx_on_grid; auto m_global = m_local + m_block_data_idx_on_grid;
auto n_global = n_local + n_block_data_idx_on_grid; auto n_global = n_local + n_block_data_idx_on_grid;
......
...@@ -1864,13 +1864,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -1864,13 +1864,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
constexpr auto position_offset = M3 * M4; constexpr auto position_offset = M3 * M4;
// save z to global // save z to global
if constexpr(IsDropout){ if constexpr(IsDropout)
{
if(p_z_grid) if(p_z_grid)
{ {
auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin; auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
auto m_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; auto m_local =
auto n_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
auto n_local =
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
auto m_global = m_local + m_block_data_idx_on_grid; auto m_global = m_local + m_block_data_idx_on_grid;
auto n_global = n_local + n_block_data_idx_on_grid; auto n_global = n_local + n_block_data_idx_on_grid;
...@@ -1880,13 +1883,15 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -1880,13 +1883,15 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
auto global_elem_id = auto global_elem_id =
(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4; (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf), blockwise_dropout
.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
decltype(z_tenor_buffer), decltype(z_tenor_buffer),
decltype(position_offset), decltype(position_offset),
true>( true>(
s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded); s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3, z_thread_copy_vgpr_to_global.Run(
z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
z_tenor_buffer, z_tenor_buffer,
z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3, z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
...@@ -1897,8 +1902,10 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2 ...@@ -1897,8 +1902,10 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
ignore = z_grid_buf; ignore = z_grid_buf;
auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin; auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
auto m_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; auto m_local =
auto n_local = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
auto n_local =
block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
auto m_global = m_local + m_block_data_idx_on_grid; auto m_global = m_local + m_block_data_idx_on_grid;
auto n_global = n_local + n_block_data_idx_on_grid; auto n_global = n_local + n_block_data_idx_on_grid;
......
...@@ -165,7 +165,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad ...@@ -165,7 +165,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
const index_t block_work_idx_m = block_work_idx[I0]; const index_t block_work_idx_m = block_work_idx[I0];
constexpr auto d_thread_desc_mblock_mrepeat_mwave_mperxdl = constexpr auto d_thread_desc_mblock_m1 =
make_naive_tensor_descriptor_packed(make_tuple(I1, I1)); make_naive_tensor_descriptor_packed(make_tuple(I1, I1));
constexpr auto y_thread_desc_m0_m1_n0_n1 = make_naive_tensor_descriptor_packed(make_tuple( constexpr auto y_thread_desc_m0_m1_n0_n1 = make_naive_tensor_descriptor_packed(make_tuple(
...@@ -244,7 +244,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad ...@@ -244,7 +244,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
auto d_thread_copy_vgpr_to_global = auto d_thread_copy_vgpr_to_global =
ThreadwiseTensorSliceTransfer_v1r3<FloatD, ThreadwiseTensorSliceTransfer_v1r3<FloatD,
FloatD, FloatD,
decltype(d_thread_desc_mblock_mrepeat_mwave_mperxdl), decltype(d_thread_desc_mblock_m1),
decltype(d_grid_desc_mblock_mperblock), decltype(d_grid_desc_mblock_mperblock),
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
Sequence<1, 1>, Sequence<1, 1>,
...@@ -260,7 +260,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad ...@@ -260,7 +260,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
ck::tensor_operation::element_wise::PassThrough{}}; ck::tensor_operation::element_wise::PassThrough{}};
// copy from VGPR to Global // copy from VGPR to Global
d_thread_copy_vgpr_to_global.Run(d_thread_desc_mblock_mrepeat_mwave_mperxdl, d_thread_copy_vgpr_to_global.Run(d_thread_desc_mblock_m1,
make_tuple(I0, I0), make_tuple(I0, I0),
y_dot_ygrad_thread_accum_buf, y_dot_ygrad_thread_accum_buf,
d_grid_desc_mblock_mperblock, d_grid_desc_mblock_mperblock,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment