Unverified Commit 909f519c authored by Harisankar Sadasivan, committed by GitHub

Merge branch 'develop' into universal_streamk

parents 406fa265 3bb0fe6c
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "test/smfmac_op/smfmac_op_util.hpp"
using BF16 = ck::bhalf_t;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
template <typename Tuple>
class TestSmfmac : public ::testing::Test
{
    protected:
    using Src1Type                           = std::tuple_element_t<0, Tuple>;
    static constexpr ck::index_t Src1VecSize = std::tuple_element_t<1, Tuple>{}.value;
    using Src2Type                           = std::tuple_element_t<2, Tuple>;
    static constexpr ck::index_t Src2VecSize = std::tuple_element_t<3, Tuple>{}.value;
    using DstType                            = std::tuple_element_t<4, Tuple>;
    static constexpr ck::index_t AccVecSize  = std::tuple_element_t<5, Tuple>{}.value;
    using GPUAccType                         = std::tuple_element_t<6, Tuple>;
    using CPUAccType                         = std::tuple_element_t<7, Tuple>;
    static constexpr ck::index_t M           = std::tuple_element_t<8, Tuple>{}.value;
    static constexpr ck::index_t N           = std::tuple_element_t<9, Tuple>{}.value;
    static constexpr ck::index_t K           = std::tuple_element_t<10, Tuple>{}.value;

    void Run()
    {
        bool pass = true;

        constexpr auto matmul_default = ck::smfmac_op_util::matmul<Src1Type,
                                                                   Src1VecSize,
                                                                   Src2Type,
                                                                   Src2VecSize,
                                                                   GPUAccType,
                                                                   AccVecSize,
                                                                   DstType,
                                                                   M,
                                                                   N,
                                                                   K>;

        constexpr auto smfmac_kernel_container = std::make_tuple(matmul_default);

        ck::static_for<0, std::tuple_size_v<decltype(smfmac_kernel_container)>, 1>{}([&](auto i) {
            pass &= ck::smfmac_op_util::TestSmfmac<
                std::tuple_element_t<i.value, decltype(smfmac_kernel_container)>,
                Src1Type,
                Src2Type,
                DstType,
                GPUAccType,
                CPUAccType,
                decltype(Row{}),
                decltype(Row{}),
                decltype(Row{}),
                PassThrough,
                PassThrough,
                PassThrough,
                AccVecSize,
                M,
                N,
                K>{}(std::get<ck::Number<i>{}>(smfmac_kernel_container));
        });

        EXPECT_TRUE(pass);
    }
};
template <ck::index_t N>
using I = ck::Number<N>;
// Tuple layout: Src1Type, Src1VecSize, Src2Type, Src2VecSize, DstType, AccVecSize,
//               GPUAccType, CPUAccType, M, N, K
using KernelTypes =
    ::testing::Types<std::tuple<F16, I<4>, F16, I<8>, F32, I<4>, F32, F32, I<16>, I<16>, I<32>>,
                     std::tuple<BF16, I<4>, BF16, I<8>, F32, I<4>, F32, F32, I<16>, I<16>, I<32>>,
                     std::tuple<F16, I<4>, F16, I<8>, F32, I<16>, F32, F32, I<32>, I<32>, I<16>>,
                     std::tuple<BF16, I<4>, BF16, I<8>, F32, I<16>, F32, F32, I<32>, I<32>, I<16>>>;
TYPED_TEST_SUITE(TestSmfmac, KernelTypes);
TYPED_TEST(TestSmfmac, TestSmfmacFP16BF16) { this->Run(); }
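
The typed tests above cover the f16 and bf16 smfmac paths at the 16x16x32 and 32x32x16 tile sizes with f32 accumulation. As background, smfmac ("sparse MFMA") instructions consume a structurally sparse A operand: of every four consecutive elements along K, at most two are non-zero, and a 2-bit index per kept value records its original slot within the group. The snippet below is only a rough host-side illustration of that 2:4 packing, included here as a note; Compressed2of4 and compress_group are hypothetical names and are not part of composable_kernel or smfmac_op_util.hpp.

// Hypothetical illustration only: compress one group of four A-elements under the
// 2:4 structured-sparsity pattern assumed by sparse matrix-core instructions.
#include <array>
#include <cstdint>

struct Compressed2of4
{
    std::array<float, 2> values; // the two kept (possibly non-zero) elements
    std::uint8_t idx;            // two 2-bit fields: original position (0..3) of each kept element
};

inline Compressed2of4 compress_group(const std::array<float, 4>& group)
{
    Compressed2of4 out{{0.0f, 0.0f}, 0};
    int kept = 0;
    // Keep the first (at most) two non-zero elements and remember where they came from.
    for(int pos = 0; pos < 4 && kept < 2; ++pos)
    {
        if(group[pos] != 0.0f)
        {
            out.values[kept] = group[pos];
            out.idx |= static_cast<std::uint8_t>(pos) << (2 * kept);
            ++kept;
        }
    }
    return out;
}

The real instructions pack these 2-bit indices into a per-lane index register and read the compressed values from the A registers; the exact register layout is target-specific and is handled inside smfmac_op_util.hpp.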
@@ -140,10 +140,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
        p_shared[8 * 16 * lane_hi + 8 * lane_lo + ele + 16 * 16] = b_temp[ele];
    }
#ifdef __gfx12__
    asm volatile("\
    s_wait_dscnt 0x0 \n \
    s_barrier_signal -1 \n \
    s_barrier_wait -1 \
    " ::);
#else
    asm volatile("\
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
#endif

    for(int ele = 0; ele < 16; ++ele)
    {
@@ -155,10 +163,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
        a_frag[ele] = p_shared[(ele / 8) * 16 * 8 + 8 * lane + ele % 8];
    }

#ifdef __gfx12__
    asm volatile("\
    s_wait_dscnt 0x0 \n \
    s_barrier_signal -1 \n \
    s_barrier_wait -1 \
    " ::);
#else
    asm volatile("\
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
#endif

    // sync threads, similar to mma_sync
    // __syncthreads();
......