Add synchronization to MFMA kernels

70c70d6c · Andriy Roshchenko · f1f36a61 · 70c70d6c
Commit 70c70d6c authored Feb 04, 2025 by Andriy Roshchenko
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 0 deletions

test/mx_mfma_op/mx_mfma_op.hpp +4 -0

No files found.
--- a/test/mx_mfma_op/mx_mfma_op.hpp
+++ b/test/mx_mfma_op/mx_mfma_op.hpp
@@ -643,7 +643,9 @@ __global__ void matmul(const AType* a, const BType* b, CType* c)
    // Matrix multiply-accumulate using MFMA units
    // Accumulation intermediate = BLOCK_M x BLOCK_N
+    __syncthreads();
    mfma_type_selector<AFragT, BFragT, AccumFragT, BLOCK_M, BLOCK_N>{}(fragA, fragB, fragAcc);
+    __syncthreads();
    for(int i = 0; i < vectorSize(fragC); ++i)
    {
@@ -696,8 +698,10 @@ matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb,
    // Scaled Matrix multiply-accumulate using MFMA units
    // Accumulation intermediate = BLOCK_M x BLOCK_N
+    __syncthreads();
    mfma_type_selector<AFragT, BFragT, AccumFragT, BLOCK_M, BLOCK_N>{}(
        fragA, fragXa, fragB, fragXb, fragAcc);
+    __syncthreads();
    for(int i = 0; i < vectorSize(fragC); ++i)
    {