Commit 733c351d authored by Adam Osewski
Browse files

Use GLC bit when reading/writing to workspace mem.

parent ae20247a
...@@ -907,8 +907,10 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2 ...@@ -907,8 +907,10 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
Sequence<7>{})); Sequence<7>{}));
auto p_workspace_grid = reinterpret_cast<AccDataType*>(p_workspace); auto p_workspace_grid = reinterpret_cast<AccDataType*>(p_workspace);
auto w_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>( auto w_grid_buf =
p_workspace_grid, workspace_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize()); make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
p_workspace_grid,
workspace_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize());
constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
BlockwiseGemmT::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); BlockwiseGemmT::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
...@@ -1070,7 +1072,7 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2 ...@@ -1070,7 +1072,7 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
make_multi_index(n_thread_data_on_block)); make_multi_index(n_thread_data_on_block));
auto p_workspace_grid = reinterpret_cast<AccDataType*>(p_workspace); auto p_workspace_grid = reinterpret_cast<AccDataType*>(p_workspace);
auto w_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>( auto w_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
p_workspace_grid, workspace_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize()); p_workspace_grid, workspace_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize());
auto acc_load = ThreadwiseTensorSliceTransfer_v2< auto acc_load = ThreadwiseTensorSliceTransfer_v2<
...@@ -1103,7 +1105,7 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2 ...@@ -1103,7 +1105,7 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
// We do not need to read this workgroup partial results since they're // We do not need to read this workgroup partial results since they're
// already in c_thread_buff // already in c_thread_buff
for(uint32_t i_t = 1; i_t <= reduce_count; ++i_t) for(uint32_t i_t = 1; i_t <= reduce_count; ++i_t)
{ {
acc_buf.Clear(); acc_buf.Clear();
acc_load.Run(workspace_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, acc_load.Run(workspace_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment