"...composable_kernel.git" did not exist on "84dcf5d043251ec166f7428c1e2b294500492453"
Commit cfbef574 authored by Adam Osewski's avatar Adam Osewski
Browse files

Fix Coherency bits and for gmem ordering through compiler builtins.

parent 160932b6
...@@ -164,6 +164,14 @@ __global__ void ...@@ -164,6 +164,14 @@ __global__ void
// do CShuffle in flight with loading partials products of other peer workgroups. // do CShuffle in flight with loading partials products of other peer workgroups.
GridwiseGemm::StorePartials(p_workspace, static_cast<void*>(p_shared), results_buffer); GridwiseGemm::StorePartials(p_workspace, static_cast<void*>(p_shared), results_buffer);
#if 0
// make sure all writes to gmem has finished.
__builtin_amdgcn_s_waitcnt(0x0f70); // s_waitcnt vmcnt(0)
// __builtin_amdgcn_s_waitcnt(0x0070); // s_waitcnt vmcnt(0) lgkmcnt(0)
__builtin_amdgcn_s_barrier(); // s_barrier
// __builtin_amdgcn_sched_barrier(0x0001); // allow all non-memory instructions to pass
__builtin_amdgcn_sched_barrier(0);
#endif
work_scheduler.FlagFinished(); work_scheduler.FlagFinished();
// The workgroup which processed first K tile accumulates results and stores to GMEM // The workgroup which processed first K tile accumulates results and stores to GMEM
......
...@@ -1019,7 +1019,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2 ...@@ -1019,7 +1019,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
MakeWorkspaceGridDesc_GridSize_MPerBlock_I1_NPerBlock(get_grid_size()); MakeWorkspaceGridDesc_GridSize_MPerBlock_I1_NPerBlock(get_grid_size());
auto p_workspace_grid = reinterpret_cast<AccDataType*>(p_workspace); auto p_workspace_grid = reinterpret_cast<AccDataType*>(p_workspace);
auto w_grid_buf = auto w_grid_buf =
#if(defined(__gfx908__) || defined(__gfx90a__))
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>( make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
#elif defined(__gfx94__)
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::SYSTEM_NT0>(
#else // for host
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::DefaultCoherence>(
#endif
p_workspace_grid, workspace_grid_desc_m0_m1_n0_n1.GetElementSpaceSize()); p_workspace_grid, workspace_grid_desc_m0_m1_n0_n1.GetElementSpaceSize());
// shuffle: blockwise copy C from LDS to workspace // shuffle: blockwise copy C from LDS to workspace
...@@ -1187,7 +1193,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2 ...@@ -1187,7 +1193,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
auto p_workspace_grid = reinterpret_cast<CShuffleDataType*>(p_workspace); auto p_workspace_grid = reinterpret_cast<CShuffleDataType*>(p_workspace);
auto w_grid_buf = auto w_grid_buf =
#if(defined(__gfx908__) || defined(__gfx90a__))
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>( make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
#elif defined(__gfx94__)
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::SYSTEM_NT0>(
#else // for host
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::DefaultCoherence>(
#endif
p_workspace_grid, workspace_grid_desc_m0m1_n0n1n2.GetElementSpaceSize()); p_workspace_grid, workspace_grid_desc_m0m1_n0n1n2.GetElementSpaceSize());
auto acc_load = ThreadwiseTensorSliceTransfer_v2< auto acc_load = ThreadwiseTensorSliceTransfer_v2<
......
...@@ -297,17 +297,17 @@ enum struct AmdBufferCoherenceEnum ...@@ -297,17 +297,17 @@ enum struct AmdBufferCoherenceEnum
GLC = 1, GLC = 1,
SLC = 2, SLC = 2,
GLC_SLC = 3, GLC_SLC = 3,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0 = 0, WAVE_NT0 = 0,
WAVE_NT1 = 2, WAVE_NT1 = 2,
GROUP_NT0 = 1, GROUP_NT0 = 1,
GROUP_NT1 = 3, GROUP_NT1 = 3,
DEVICE_NT0 = 8, DEVICE_NT0 = 16,
DEVICE_NT1 = 10, DEVICE_NT1 = 18,
SYSTEM_NT0 = 9, SYSTEM_NT0 = 17,
SYSTEM_NT1 = 11, SYSTEM_NT1 = 19,
}; };
template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment