Commit bb7294ae authored by rocking
Browse files

Prevent redundant IO

parent 3df07c27
...@@ -1030,6 +1030,8 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle ...@@ -1030,6 +1030,8 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle
mean_thread_buf(j), var_thread_buf(j), count_thread_buf(j)); mean_thread_buf(j), var_thread_buf(j), count_thread_buf(j));
}); });
if(post_shuffle_thread_cluster_idx[I1] == 0)
{
constexpr auto thread_welford_desc_I_m_I = make_naive_tensor_descriptor_packed( constexpr auto thread_welford_desc_I_m_I = make_naive_tensor_descriptor_packed(
make_tuple(I1, Number<PostShuffleThreadSliceSize_M>{}, I1)); make_tuple(I1, Number<PostShuffleThreadSliceSize_M>{}, I1));
...@@ -1048,7 +1050,8 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle ...@@ -1048,7 +1050,8 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle
1, 1,
InMemoryDataOperationEnum::Set, InMemoryDataOperationEnum::Set,
1, 1,
false>{mean_var_grid_desc_mblock_mperblock_nblock, false>{
mean_var_grid_desc_mblock_mperblock_nblock,
make_multi_index(block_work_idx[I0], // mblock make_multi_index(block_work_idx[I0], // mblock
shuffleMPerBlock * i + shuffleMPerBlock * i +
post_shuffle_thread_data_idx_begin[I0], // mperblock post_shuffle_thread_data_idx_begin[I0], // mperblock
...@@ -1067,13 +1070,29 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle ...@@ -1067,13 +1070,29 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle
1, 1,
InMemoryDataOperationEnum::Set, InMemoryDataOperationEnum::Set,
1, 1,
false>{mean_var_grid_desc_mblock_mperblock_nblock, false>{
mean_var_grid_desc_mblock_mperblock_nblock,
make_multi_index(block_work_idx[I0], // mblock make_multi_index(block_work_idx[I0], // mblock
shuffleMPerBlock * i + shuffleMPerBlock * i +
post_shuffle_thread_data_idx_begin[I0], // mperblock post_shuffle_thread_data_idx_begin[I0], // mperblock
block_work_idx[I1]), // nblock block_work_idx[I1]), // nblock
tensor_operation::element_wise::PassThrough{}}; tensor_operation::element_wise::PassThrough{}};
mean_thread_copy_vgpr_to_global.Run(thread_welford_desc_I_m_I,
make_tuple(I0, I0, I0),
mean_thread_buf,
mean_var_grid_desc_mblock_mperblock_nblock,
mean_grid_buf);
var_thread_copy_vgpr_to_global.Run(thread_welford_desc_I_m_I,
make_tuple(I0, I0, I0),
var_thread_buf,
mean_var_grid_desc_mblock_mperblock_nblock,
var_grid_buf);
if(i == 0 && block_work_idx[I0] == 0 &&
post_shuffle_thread_data_idx_begin[I0] == 0)
{
auto count_thread_copy_vgpr_to_global = ThreadwiseTensorSliceTransfer_v1r3< auto count_thread_copy_vgpr_to_global = ThreadwiseTensorSliceTransfer_v1r3<
int32_t, int32_t,
int32_t, int32_t,
...@@ -1087,29 +1106,21 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle ...@@ -1087,29 +1106,21 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle
InMemoryDataOperationEnum::Set, InMemoryDataOperationEnum::Set,
1, 1,
false>{count_grid_desc_mblock_mperblock_nblock, false>{count_grid_desc_mblock_mperblock_nblock,
make_multi_index(block_work_idx[I0], // mblock make_multi_index(
block_work_idx[I0], // mblock
shuffleMPerBlock * i + shuffleMPerBlock * i +
post_shuffle_thread_data_idx_begin[I0], // mperblock post_shuffle_thread_data_idx_begin[I0], // mperblock
block_work_idx[I1]), // nblock block_work_idx[I1]), // nblock
tensor_operation::element_wise::PassThrough{}}; tensor_operation::element_wise::PassThrough{}};
mean_thread_copy_vgpr_to_global.Run(thread_welford_desc_I_m_I, count_thread_copy_vgpr_to_global.Run(
make_tuple(I0, I0, I0), thread_welford_desc_I_m_I,
mean_thread_buf,
mean_var_grid_desc_mblock_mperblock_nblock,
mean_grid_buf);
var_thread_copy_vgpr_to_global.Run(thread_welford_desc_I_m_I,
make_tuple(I0, I0, I0),
var_thread_buf,
mean_var_grid_desc_mblock_mperblock_nblock,
var_grid_buf);
count_thread_copy_vgpr_to_global.Run(thread_welford_desc_I_m_I,
make_tuple(I0, I0, I0), make_tuple(I0, I0, I0),
count_thread_buf, count_thread_buf,
count_grid_desc_mblock_mperblock_nblock, count_grid_desc_mblock_mperblock_nblock,
welford_count_grid_buf); welford_count_grid_buf);
}
}
}); });
} // shuffle C + Ds + welford + write out } // shuffle C + Ds + welford + write out
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment