Fix coherency bits and gmem ordering through compiler builtins.

cfbef574 · Adam Osewski · 160932b6 · cfbef574 · cfbef574 · cfbef574
Commit cfbef574 authored May 14, 2024 by Adam Osewski
3 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp
@@ -164,6 +164,14 @@ __global__ void
        // do CShuffle in flight with loading partials products of other peer workgroups.
        GridwiseGemm::StorePartials(p_workspace, static_cast<void*>(p_shared), results_buffer);
+#if 0
+        // make sure all writes to gmem have finished.
+        __builtin_amdgcn_s_waitcnt(0x0f70); // s_waitcnt vmcnt(0)
+        // __builtin_amdgcn_s_waitcnt(0x0070);         // s_waitcnt vmcnt(0) lgkmcnt(0)
+        __builtin_amdgcn_s_barrier(); // s_barrier
+        // __builtin_amdgcn_sched_barrier(0x0001);  // allow all non-memory instructions to pass
+        __builtin_amdgcn_sched_barrier(0);
+#endif
        work_scheduler.FlagFinished();
        // The workgroup which processed first K tile accumulates results and stores to GMEM

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle_v2.hpp
@@ -1019,7 +1019,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
            MakeWorkspaceGridDesc_GridSize_MPerBlock_I1_NPerBlock(get_grid_size());
        auto p_workspace_grid = reinterpret_cast<AccDataType*>(p_workspace);
        auto w_grid_buf =
+#if(defined(__gfx908__) || defined(__gfx90a__))
            make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
+#elif defined(__gfx94__)
+            make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::SYSTEM_NT0>(
+#else // for host
+            make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::DefaultCoherence>(
+#endif
                p_workspace_grid, workspace_grid_desc_m0_m1_n0_n1.GetElementSpaceSize());
        // shuffle: blockwise copy C from LDS to workspace
@@ -1187,7 +1193,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
        auto p_workspace_grid = reinterpret_cast<CShuffleDataType*>(p_workspace);
        auto w_grid_buf =
+#if(defined(__gfx908__) || defined(__gfx90a__))
            make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
+#elif defined(__gfx94__)
+            make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::SYSTEM_NT0>(
+#else // for host
+            make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::DefaultCoherence>(
+#endif
                p_workspace_grid, workspace_grid_desc_m0m1_n0n1n2.GetElementSpaceSize());
        auto acc_load = ThreadwiseTensorSliceTransfer_v2<

--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -297,17 +297,17 @@ enum struct AmdBufferCoherenceEnum
    GLC              = 1,
    SLC              = 2,
    GLC_SLC          = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+    // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
    WAVE_NT0   = 0,
    WAVE_NT1   = 2,
    GROUP_NT0  = 1,
    GROUP_NT1  = 3,
-    DEVICE_NT0 = 8,
+    DEVICE_NT0 = 16,
-    DEVICE_NT1 = 10,
+    DEVICE_NT1 = 18,
-    SYSTEM_NT0 = 9,
+    SYSTEM_NT0 = 17,
-    SYSTEM_NT1 = 11,
+    SYSTEM_NT1 = 19,
 };
 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>