fix clang format and test/wmma_op

256c4947 · illsilin · 7f015d49 · 256c4947 · 256c4947
Commit 256c4947 authored May 10, 2024 by illsilin
Show whitespace changes
Inline Side-by-side

Showing with 18 additions and 2 deletions

include/ck_tile/core/config.hpp include/ck_tile/core/config.hpp +2 -2

test/wmma_op/wmma_op_util.hpp test/wmma_op/wmma_op_util.hpp +16 -0

No files found.
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -143,8 +143,8 @@
 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x00020000
 #elif defined(__gfx1030__) // for GPU code
 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31014000
-#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1200__) || defined(__gfx1201__) // for GPU code
+#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
+    defined(__gfx1103__) || defined(__gfx1200__) || defined(__gfx1201__) // for GPU code
 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31004000
 #endif


--- a/test/wmma_op/wmma_op_util.hpp
+++ b/test/wmma_op/wmma_op_util.hpp
@@ -140,10 +140,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
        p_shared[8 * 16 * lane_hi + 8 * lane_lo + ele + 16 * 16] = b_temp[ele];
    }

+#ifdef __gfx12__
+    asm volatile("\
+    s_wait_dscnt 0x0 \n \
+    s_barrier_signal -1 \n \
+    s_barrier_wait -1 \
+    " ::);
+#else
    asm volatile("\
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
+#endif

    for(int ele = 0; ele < 16; ++ele)
    {
@@ -155,10 +163,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
        a_frag[ele] = p_shared[(ele / 8) * 16 * 8 + 8 * lane + ele % 8];
    }

+#ifdef __gfx12__
+    asm volatile("\
+    s_wait_dscnt 0x0 \n \
+    s_barrier_signal -1 \n \
+    s_barrier_wait -1 \
+    " ::);
+#else
    asm volatile("\
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
+#endif

    // sync threads, similar to mma_sync
    // __syncthreads();