Commit cfbef574 authored by Adam Osewski
Browse files

Fix coherency bits and gmem ordering through compiler builtins.

parent 160932b6
......@@ -164,6 +164,14 @@ __global__ void
// do CShuffle in flight with loading partials products of other peer workgroups.
GridwiseGemm::StorePartials(p_workspace, static_cast<void*>(p_shared), results_buffer);
#if 0
// make sure all writes to gmem have finished.
__builtin_amdgcn_s_waitcnt(0x0f70); // s_waitcnt vmcnt(0)
// __builtin_amdgcn_s_waitcnt(0x0070); // s_waitcnt vmcnt(0) lgkmcnt(0)
__builtin_amdgcn_s_barrier(); // s_barrier
// __builtin_amdgcn_sched_barrier(0x0001); // allow all non-memory instructions to pass
__builtin_amdgcn_sched_barrier(0);
#endif
work_scheduler.FlagFinished();
// The workgroup which processed first K tile accumulates results and stores to GMEM
......
......@@ -1019,7 +1019,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
MakeWorkspaceGridDesc_GridSize_MPerBlock_I1_NPerBlock(get_grid_size());
auto p_workspace_grid = reinterpret_cast<AccDataType*>(p_workspace);
auto w_grid_buf =
#if(defined(__gfx908__) || defined(__gfx90a__))
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
#elif defined(__gfx94__)
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::SYSTEM_NT0>(
#else // for host
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::DefaultCoherence>(
#endif
p_workspace_grid, workspace_grid_desc_m0_m1_n0_n1.GetElementSpaceSize());
// shuffle: blockwise copy C from LDS to workspace
......@@ -1187,7 +1193,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
auto p_workspace_grid = reinterpret_cast<CShuffleDataType*>(p_workspace);
auto w_grid_buf =
#if(defined(__gfx908__) || defined(__gfx90a__))
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
#elif defined(__gfx94__)
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::SYSTEM_NT0>(
#else // for host
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::DefaultCoherence>(
#endif
p_workspace_grid, workspace_grid_desc_m0m1_n0n1n2.GetElementSpaceSize());
auto acc_load = ThreadwiseTensorSliceTransfer_v2<
......
......@@ -297,17 +297,17 @@ enum struct AmdBufferCoherenceEnum
GLC = 1,
SLC = 2,
GLC_SLC = 3,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0 = 0,
WAVE_NT1 = 2,
GROUP_NT0 = 1,
GROUP_NT1 = 3,
DEVICE_NT0 = 8,
DEVICE_NT1 = 10,
SYSTEM_NT0 = 9,
SYSTEM_NT1 = 11,
DEVICE_NT0 = 16,
DEVICE_NT1 = 18,
SYSTEM_NT0 = 17,
SYSTEM_NT1 = 19,
};
template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment