Merge branch 'develop' into lwpck-1815

06701e70 · Rostyslav Geyyer · GitHub · 5800d24e · da42a889 · 06701e70
Unverified Commit 06701e70 authored Jul 09, 2024 by Rostyslav Geyyer Committed by GitHub Jul 09, 2024
16 changed files
--- a/profiler/src/profile_grouped_conv_fwd_outelementop.cpp
+++ b/profiler/src/profile_grouped_conv_fwd_outelementop.cpp
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp"
+#include "ck/utility/data_type.hpp"
+#include "profiler_operation_registry.hpp"
+#include <iostream>
+// Tensor layout combination (input / weight / output) selected on the command line as arg4.
+enum class ConvLayout
+{
+    GNHWC_GKYXC_GNHWK = 0, // Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]
+    NHWGC_GKYXC_NHWGK = 1, // Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]
+};
+// Output element-wise operation selected on the command line as arg3.
+enum class OutElementOp
+{
+    ConvScale    = 0,
+    ConvInvScale = 1,
+};
+// Input / weight / output data-type combination selected on the command line as arg2.
+// Enumerator names read <input>_<weight>_<output>.
+enum class ConvDataType
+{
+    F8_F8_F8   = 0,
+    BF8_BF8_F8 = 1,
+    F8_BF8_F8  = 2,
+    BF8_F8_F8  = 3,
+};
+#define OP_NAME "grouped_conv_fwd_outelementop"
+#define OP_DESC "Grouped Convolution Forward+Elementwise Operation"
+// Prints the command-line usage of this profiler operation to stdout, followed by the generic
+// convolution-parameter help text supplied by ck::utils::conv.
+static void print_helper_msg()
+{
+    // clang-format off
+    // Fixed part of the usage text; adjacent string literals are merged at compile time.
+    constexpr const char* usage_text =
+        "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
+        "arg2: data type (0: Input fp8, Weight fp8, Output fp8\n"
+        "                 1: Input bf8, Weight bf8, Output fp8\n"
+        "                 2: Input fp8, Weight bf8, Output fp8\n"
+        "                 3: Input bf8, Weight fp8, Output fp8)\n"
+        "arg3: element-wise operation (0: ConvScale\n"
+        "                              1: ConvInvScale)\n"
+        "arg4: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
+        "                     1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
+        "arg5: verification (0: no, 1: yes)\n"
+        "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+        "arg7: print tensor value (0: no; 1: yes)\n"
+        "arg8: time kernel (0: no, 1: yes)\n";
+    // clang-format on
+    std::cout << usage_text << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+// Profiler entry point for grouped convolution forward with an output element-wise operation.
+// Parses the control arguments (argv[2]..argv[8]), the spatial dimension count (argv[9]) and the
+// convolution parameters (from argv[10] on), then dispatches to the matching template
+// instantiation. Returns 0 when the profiled run passes and 1 on a usage error, a failed run or
+// an unsupported configuration.
+int grouped_conv_fwd_outelementop(int argc, char* argv[])
+{
+    // argv[0], 8 control arguments and num_dim_spatial must be present before parsing starts
+    if(argc < 10)
+    {
+        print_helper_msg();
+        return 1;
+    }
+    const auto data_type       = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const auto op              = static_cast<OutElementOp>(std::stoi(argv[3]));
+    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[4]));
+    const bool do_verification = std::stoi(argv[5]);
+    const int init_method      = std::stoi(argv[6]);
+    const bool do_log          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[8]);
+    const int num_dim_spatial  = std::stoi(argv[9]);
+    // 1 for argv[0], 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C and
+    // 6 * num_dim_spatial for the per-dimension convolution parameters
+    const int expected_argc = 1 + 8 + 1 + 4 + 6 * num_dim_spatial;
+    if(argc != expected_argc)
+    {
+        print_helper_msg();
+        return 1;
+    }
+    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv);
+    using F8           = ck::f8_t;
+    using BF8          = ck::bf8_t;
+    using GKZYXC       = ck::tensor_layout::convolution::GKZYXC;
+    using NDHWGC       = ck::tensor_layout::convolution::NDHWGC;
+    using NDHWGK       = ck::tensor_layout::convolution::NDHWGK;
+    using ConvScale    = ck::tensor_operation::element_wise::ConvScale;
+    using ConvInvScale = ck::tensor_operation::element_wise::ConvInvscale;
+    constexpr auto I3  = ck::Number<3>{};
+    // Every argument is a tag object: only its type is used to pick the template instantiation.
+    auto run_profiler = [&](auto ndim_tag,
+                            auto in_layout,
+                            auto wei_layout,
+                            auto out_layout,
+                            auto in_type,
+                            auto wei_type,
+                            auto out_type,
+                            auto out_element_op,
+                            auto a_compute_type,
+                            auto b_compute_type) {
+        constexpr ck::index_t NDimSpatial = ndim_tag.value;
+        const bool passed =
+            ck::profiler::profile_grouped_conv_fwd_outelementop_impl<NDimSpatial,
+                                                                     decltype(in_layout),
+                                                                     decltype(wei_layout),
+                                                                     decltype(out_layout),
+                                                                     decltype(in_type),
+                                                                     decltype(wei_type),
+                                                                     decltype(out_type),
+                                                                     decltype(out_element_op),
+                                                                     decltype(a_compute_type),
+                                                                     decltype(b_compute_type)>(
+                do_verification, init_method, do_log, time_kernel, params);
+        return passed ? 0 : 1;
+    };
+    // Marker for "no instantiation matched"; run_profiler itself only ever yields 0 or 1.
+    constexpr int not_dispatched = -1;
+    // Selects the input/weight (and matching compute) types for one output element-wise op.
+    // The output type is F8 for every supported combination.
+    auto dispatch_data_type = [&](auto out_element_op) {
+        switch(data_type)
+        {
+        case ConvDataType::F8_F8_F8:
+            return run_profiler(
+                I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, F8{}, F8{}, out_element_op, F8{}, F8{});
+        case ConvDataType::BF8_BF8_F8:
+            return run_profiler(
+                I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, BF8{}, F8{}, out_element_op, BF8{}, BF8{});
+        case ConvDataType::F8_BF8_F8:
+            return run_profiler(
+                I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, BF8{}, F8{}, out_element_op, F8{}, BF8{});
+        case ConvDataType::BF8_F8_F8:
+            return run_profiler(
+                I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, F8{}, F8{}, out_element_op, BF8{}, F8{});
+        }
+        return not_dispatched;
+    };
+    int status = not_dispatched;
+    // Only the 3D case with layout option 1 is dispatched here.
+    if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
+    {
+        if(op == OutElementOp::ConvScale)
+        {
+            status = dispatch_data_type(ConvScale{});
+        }
+        else if(op == OutElementOp::ConvInvScale)
+        {
+            status = dispatch_data_type(ConvInvScale{});
+        }
+    }
+    if(status != not_dispatched)
+    {
+        return status;
+    }
+    std::cout << "this data_type & layout is not implemented" << std::endl;
+    return 1;
+}
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, grouped_conv_fwd_outelementop);
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -98,8 +98,8 @@ int profile_grouped_gemm(int argc, char* argv[])
    int n_iter   = 10;
    if(argc == 17)
    {
-        n_warmup = std::stoi(argv[16]);
+        n_warmup = std::stoi(argv[15]);
-        n_iter   = std::stoi(argv[17]);
+        n_iter   = std::stoi(argv[16]);
    }
 #ifdef CK_ENABLE_FP16

--- a/script/check_copyright_year.sh
+++ b/script/check_copyright_year.sh
--- a/script/profile_grouped_conv_fwd_outelementop.sh
+++ b/script/profile_grouped_conv_fwd_outelementop.sh
+#!/bin/bash
+## GPU visibility
+export HIP_VISIBLE_DEVICES=0
+DRIVER="../build/bin/ckProfiler"
+OP=$1
+DATATYPE=$2
+OUTELEMENTOP=$3
+LAYOUT=$4
+VERIFY=$5
+INIT=$6
+LOG=$7
+TIME=$8
+N=$9
+#######  op    datatype  OUTELEMENTOP  layout   verify   init   log   time  Ndims  G    N    K     C   Z   Y   X   Di  Hi   Wi  Sz  Sy  Sx  Dz  Dy  Dx  Left Pz LeftPy  LeftPx  RightPz RightPy  RightPx
+$DRIVER $OP   $DATATYPE $OUTELEMENTOP $LAYOUT  $VERIFY  $INIT  $LOG  $TIME      3 32   $N   96    96   3   3   3   28  28   28   1   1   1   1   1   1        1      1       1        1       1        1
+$DRIVER $OP   $DATATYPE $OUTELEMENTOP $LAYOUT  $VERIFY  $INIT  $LOG  $TIME      3 32   $N  192   192   3   3   3   28  28   28   1   1   1   1   1   1        1      1       1        1       1        1
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -60,7 +60,7 @@ function(add_test_executable TEST_NAME)
        endif()
    endforeach()
    foreach(source IN LISTS ARGN)
-        if(NOT TEST_TARGETS MATCHES "gfx11" AND source MATCHES "wmma")
+	if(NOT TEST_TARGETS MATCHES "gfx11" AND NOT TEST_TARGETS MATCHES "gfx12" AND source MATCHES "wmma")
            message("removing wmma test ${source} ")
            list(REMOVE_ITEM ARGN "${source}")
        endif()
@@ -71,6 +71,8 @@ function(add_test_executable TEST_NAME)
             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
        elseif(ARGN MATCHES "_wmma")
             list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+        elseif(ARGN MATCHES "_smfmac")
+             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a)
        endif()
        set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
        add_executable(${TEST_NAME} ${ARGN})
@@ -139,7 +141,7 @@ function(add_gtest_executable TEST_NAME)
        endif()
    endforeach()
    foreach(source IN LISTS ARGN)
-        if(NOT TEST_TARGETS MATCHES "gfx11" AND source MATCHES "wmma")
+	if(NOT TEST_TARGETS MATCHES "gfx11" AND NOT TEST_TARGETS MATCHES "gfx12" AND source MATCHES "wmma")
            message("removing wmma test ${source} ")
            list(REMOVE_ITEM ARGN "${source}")
        endif()
@@ -150,6 +152,8 @@ function(add_gtest_executable TEST_NAME)
             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
        elseif(ARGN MATCHES "_wmma")
             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+        elseif(ARGN MATCHES "_smfmac")
+             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a)
        endif()
        set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
        add_executable(${TEST_NAME} ${ARGN})
@@ -209,4 +213,7 @@ add_subdirectory(wrapper)
 if(GPU_TARGETS MATCHES "gfx11")
    add_subdirectory(wmma_op)
 endif()
+if(GPU_TARGETS MATCHES "gfx942" AND (CK_HIP_VERSION_MAJOR GREATER 6 OR (CK_HIP_VERSION_MAJOR EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2))) # smfmac needs ROCm 6.2 or newer; compare major first so e.g. 7.0 is accepted
+    add_subdirectory(smfmac_op)
+endif()
 add_subdirectory(position_embedding)
--- a/test/grouped_convnd_bwd_data/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_data/CMakeLists.txt
@@ -2,11 +2,11 @@ add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_x
 if(result EQUAL 0)
    target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
 endif()
-add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_xdl.cpp)
+add_gtest_executable(test_grouped_convnd_bwd_data_interface_xdl test_grouped_convnd_bwd_data_interface_xdl.cpp)
 if(result EQUAL 0)
-    target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
+    target_link_libraries(test_grouped_convnd_bwd_data_interface_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance)
 endif()
-add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_wmma.cpp)
+add_gtest_executable(test_grouped_convnd_bwd_data_interface_wmma test_grouped_convnd_bwd_data_interface_wmma.cpp)
 if(result EQUAL 0)
-    target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
+    target_link_libraries(test_grouped_convnd_bwd_data_interface_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance)
 endif()
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp
@@ -52,6 +52,14 @@ class TestGroupedConvndBwdData : public ::testing::Test
    ck::utils::conv::ConvParam conv_param;
+    void SetUp() override
+    {
+        if(!ck::is_gfx11_supported()) // this WMMA interface fixture only runs where gfx11 is reported
+        {
+            GTEST_SKIP(); // marks every test of the fixture as skipped instead of failed
+        }
+    }
    template <ck::index_t NDimSpatial>
    bool Run()
    {

--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
@@ -5,13 +5,13 @@ if(GPU_TARGETS MATCHES "gfx9" OR DL_KERNELS)
      add_gtest_executable(test_grouped_convnd_bwd_weight test_grouped_convnd_bwd_weight.cpp)
      target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv3d_bwd_weight_instance)
 endif()
-add_gtest_executable(test_grouped_convnd_bwd_weight_interface test_grouped_convnd_bwd_weight_interface_xdl.cpp)
+add_gtest_executable(test_grouped_convnd_bwd_weight_interface_xdl test_grouped_convnd_bwd_weight_interface_xdl.cpp)
 if(result EQUAL 0)
-   target_link_libraries(test_grouped_convnd_bwd_weight_interface PRIVATE utility)
+   target_link_libraries(test_grouped_convnd_bwd_weight_interface_xdl PRIVATE utility)
 endif()
-add_gtest_executable(test_grouped_convnd_bwd_weight_interface test_grouped_convnd_bwd_weight_interface_wmma.cpp)
+add_gtest_executable(test_grouped_convnd_bwd_weight_interface_wmma test_grouped_convnd_bwd_weight_interface_wmma.cpp)
 if(result EQUAL 0)
-   target_link_libraries(test_grouped_convnd_bwd_weight_interface PRIVATE utility)
+   target_link_libraries(test_grouped_convnd_bwd_weight_interface_wmma PRIVATE utility)
 endif()
 add_gtest_executable(test_grouped_conv_bwd_weight_xdl_bilinear test_grouped_conv_bwd_weight_xdl_bilinear.cpp)
 if(result EQUAL 0)

--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
@@ -44,7 +44,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
            }
        }
-        if(ck::is_gfx11_supported())
+        if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
        {
            // on gfx11 only support for 3d is implemented
            if constexpr(NDimSpatial{} != 3)

--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_wmma.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_wmma.cpp
@@ -52,6 +52,14 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
    ck::utils::conv::ConvParam conv_param;
+    void SetUp() override
+    {
+        if(!ck::is_gfx11_supported()) // this WMMA interface fixture only runs where gfx11 is reported
+        {
+            GTEST_SKIP(); // marks every test of the fixture as skipped instead of failed
+        }
+    }
    template <ck::index_t SplitK>
    bool Run()
    {

--- a/test/grouped_convnd_fwd/CMakeLists.txt
+++ b/test/grouped_convnd_fwd/CMakeLists.txt
 if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11")
    add_gtest_executable(test_grouped_convnd_fwd test_grouped_convnd_fwd.cpp)
-    if(GPU_TARGETS MATCHES "gfx11")
+    if((GPU_TARGETS MATCHES "gfx11") AND (NOT GPU_TARGETS MATCHES "gfx9"))
        target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
    else()
        target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)

--- a/test/smfmac_op/CMakeLists.txt
+++ b/test/smfmac_op/CMakeLists.txt
+add_gtest_executable(test_smfmac_op smfmac_op_xdl.cpp)
+# Link only when add_gtest_executable actually created the target (same guard as the other tests)
+if(result EQUAL 0)
+    target_link_libraries(test_smfmac_op PRIVATE utility)
+endif()
--- a/test/smfmac_op/smfmac_op.cpp
+++ b/test/smfmac_op/smfmac_op.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <tuple>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "test/smfmac_op/smfmac_op_util.hpp"
+// Runs the smfmac test for one combination of operand types, vector sizes and M/N/K shape.
+// All three matrices use the row-major layout and pass-through element-wise operations.
+// Returns true when every kernel in the (currently single-entry) kernel list passes.
+template <typename Src1Type,
+          ck::index_t Src1VecSize,
+          typename Src2Type,
+          ck::index_t Src2VecSize,
+          typename DstType,
+          ck::index_t AccVecSize,
+          typename GPUAccType,
+          typename CPUAccType,
+          ck::index_t M,
+          ck::index_t N,
+          ck::index_t K>
+bool run_test()
+{
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    // Device kernel under test, stored in a tuple so that more variants can be appended later.
+    const auto kernel_fn = ck::smfmac_op_util::matmul<Src1Type,
+                                                      Src1VecSize,
+                                                      Src2Type,
+                                                      Src2VecSize,
+                                                      GPUAccType,
+                                                      AccVecSize,
+                                                      DstType,
+                                                      M,
+                                                      N,
+                                                      K>;
+    const auto kernels = std::make_tuple(kernel_fn);
+    bool all_passed    = true;
+    ck::static_for<0, 1, 1>{}([&](auto idx) {
+        using Tester =
+            ck::smfmac_op_util::TestSmfmac<decltype(std::get<ck::Number<idx>{}>(kernels)),
+                                           Src1Type,
+                                           Src2Type,
+                                           DstType,
+                                           GPUAccType,
+                                           CPUAccType,
+                                           RowMajor,
+                                           RowMajor,
+                                           RowMajor,
+                                           PassThrough,
+                                           PassThrough,
+                                           PassThrough,
+                                           AccVecSize,
+                                           M,
+                                           N,
+                                           K>;
+        all_passed &= Tester{}(std::get<ck::Number<idx>{}>(kernels));
+    });
+    return all_passed;
+}
+// Runs every smfmac configuration listed below and reports the combined result.
+// Exit status is 0 when all configurations pass and 1 otherwise, so that the shell and CTest
+// see success as success (returning the bool directly would invert this).
+int main(int, char*[])
+{
+    bool pass = true;
+    // clang-format off
+    //              |   Src1Type| Src1VecSize|    Src2Type| Src2VecSize| DstType| DstVecSize|  GPUAccType| CPUAccType| M| N| K|
+    pass &= run_test< ck::half_t,           4,  ck::half_t,           8,   float,          4,       float,      float,16,16,32>();
+    pass &= run_test<ck::bhalf_t,           4, ck::bhalf_t,           8,   float,          4,       float,      float,16,16,32>();
+    pass &= run_test< ck::half_t,           4,  ck::half_t,           8,   float,         16,       float,      float,32,32,16>();
+    pass &= run_test<ck::bhalf_t,           4, ck::bhalf_t,           8,   float,         16,       float,      float,32,32,16>();
+    // clang-format on
+    std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
+    return pass ? 0 : 1;
+}
--- a/test/smfmac_op/smfmac_op_util.hpp
+++ b/test/smfmac_op/smfmac_op_util.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/utility/amd_smfmac.hpp"
+#include "ck/library/utility/fill.hpp"
+namespace ck {
+namespace smfmac_op_util {
+// Maps an (A vector, B vector, accumulator buffer) type triple to the matching smfmac intrinsic.
+// The primary template is the fallback for triples without a specialization: it does nothing.
+template <typename src_vec1, typename src_vec2, typename acc_vec>
+__device__ void
+builtin_smfmac_naive_selector(const src_vec1&, const src_vec2&, const int32_t&, acc_vec&)
+{
+}
+// half4_t x half8_t with a 4-float accumulator -> intrin_smfmac_f32_16x16x32f16<16, 16>
+template <>
+__device__ void
+builtin_smfmac_naive_selector<half4_t,
+                              half8_t,
+                              StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 4, true>>(
+    const half4_t& reg_a,
+    const half8_t& reg_b,
+    const int32_t& reg_idx,
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 4, true>& reg_c)
+{
+    // The buffer holds a single vector; the intrinsic accumulates into it in place.
+    auto& acc = reg_c.GetVectorTypeReference(Number<0>{});
+    intrin_smfmac_f32_16x16x32f16<16, 16>::Run(reg_a, reg_b, reg_idx, acc);
+}
+// bhalf4_t x bhalf8_t with a 4-float accumulator -> intrin_smfmac_f32_16x16x32bf16<16, 16>
+template <>
+__device__ void
+builtin_smfmac_naive_selector<bhalf4_t,
+                              bhalf8_t,
+                              StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 4, true>>(
+    const bhalf4_t& reg_a,
+    const bhalf8_t& reg_b,
+    const int32_t& reg_idx,
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 4, true>& reg_c)
+{
+    auto& acc = reg_c.GetVectorTypeReference(Number<0>{});
+    intrin_smfmac_f32_16x16x32bf16<16, 16>::Run(reg_a, reg_b, reg_idx, acc);
+}
+// half4_t x half8_t with a 16-float accumulator -> intrin_smfmac_f32_32x32x16f16<32, 32>
+template <>
+__device__ void builtin_smfmac_naive_selector<
+    half4_t,
+    half8_t,
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 16, true>>(
+    const half4_t& reg_a,
+    const half8_t& reg_b,
+    const int32_t& reg_idx,
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 16, true>& reg_c)
+{
+    auto& acc = reg_c.GetVectorTypeReference(Number<0>{});
+    intrin_smfmac_f32_32x32x16f16<32, 32>::Run(reg_a, reg_b, reg_idx, acc);
+}
+// bhalf4_t x bhalf8_t with a 16-float accumulator -> intrin_smfmac_f32_32x32x16bf16<32, 32>
+template <>
+__device__ void builtin_smfmac_naive_selector<
+    bhalf4_t,
+    bhalf8_t,
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 16, true>>(
+    const bhalf4_t& reg_a,
+    const bhalf8_t& reg_b,
+    const int32_t& reg_idx,
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 16, true>& reg_c)
+{
+    auto& acc = reg_c.GetVectorTypeReference(Number<0>{});
+    intrin_smfmac_f32_32x32x16bf16<32, 32>::Run(reg_a, reg_b, reg_idx, acc);
+}
+// Smfmac instructions use 4:2 structural sparsity: in every contiguous subgroup of 4 elements,
+// at least 2 must be equal to zero, and the positions of the non-zero elements are stored in the
+// idx register to allow selection of the corresponding B matrix elements for multiplication.
+// Currently smfmac instructions support only the A matrix as sparse.
+// Naive single-smfmac test kernel. Each lane copies 8 elements of a (indexed as row-major M x K)
+// and 8 elements of b (indexed as row-major K x N) through shared memory, packs the non-zero A
+// values and their positions into a_frag / idx, issues one smfmac through
+// builtin_smfmac_naive_selector and writes its acc_vec_size accumulator values to c.
+// NOTE(review): the per-lane element count 8 is hard-coded; it matches src1_vec_size * 2 and
+// src2_vec_size only for vector sizes 4 and 8 - confirm before instantiating other sizes.
+template <typename src1_t,
+          index_t src1_vec_size,
+          typename src2_t,
+          index_t src2_vec_size,
+          typename acc_t,
+          index_t acc_vec_size,
+          typename dst_t,
+          int32_t M,
+          int32_t N,
+          int32_t K>
+__global__ void matmul(const src1_t* a, const src2_t* b, dst_t* c)
+{
+    __shared__ src1_t a_shared[M * K];
+    __shared__ src2_t b_shared[K * N];
+    const int lane = threadIdx.x;
+    // smfmac's A part is storing only non-zero elements in 2VGPRs
+    // smfmac's B part is storing all elements in 4VGPRs
+    using src1_vec      = typename vector_type<src1_t, src1_vec_size>::type;
+    using src1_full_vec = typename vector_type<src1_t, src1_vec_size * 2>::type;
+    using src2_vec      = typename vector_type<src2_t, src2_vec_size>::type;
+    src1_vec a_frag     = {};
+    src2_vec b_frag     = {};
+    src1_full_vec a_temp = {};
+    src2_vec b_temp      = {};
+    // initialize c fragment to 0
+    // NOTE(review): no explicit zeroing is visible here; presumably this relies on the default
+    // construction of acc_vec - confirm.
+    using acc_vec = StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, acc_t, 1, acc_vec_size, true>;
+    acc_vec c_thread_buf_;
+    // Stage this lane's 8 uncompressed A elements: row (lane % M), columns (lane / M) * 8 + i.
+    for(int i = 0; i < 8; ++i)
+    {
+        a_temp[i] = a[(lane % M) * K + (lane / M) * 8 + i]; // M K
+    }
+    // Stage this lane's 8 B elements: column (lane % N), rows 8 * (lane / N) + i.
+    for(int i = 0; i < 8; ++i)
+    {
+        b_temp[i] = b[(8 * (lane / N) + i) * N + (lane % N)]; // K N
+    }
+    // NOTE(review): shared memory has not been written yet at this barrier.
+    __syncthreads();
+    for(int i = 0; i < 8; ++i)
+    {
+        a_shared[(lane % M) * K + (lane / M) * 8 + i] = a_temp[i];
+    }
+    for(int i = 0; i < 8; ++i)
+    {
+        b_shared[(8 * (lane / N) + i) * N + (lane % N)] = b_temp[i];
+    }
+    __syncthreads();
+    // Idx must be a 32-bit register and it is storing 4 2-bit indexes of A's non zero elements.
+    // It starts with last two elements of every 4 elements subgroup set as non-zero
+    int32_t idx = 0b11101110;
+    // Bit masks are for zeroing 0-3rd position of idx
+    static constexpr int32_t bit_clear_masks[4] = {0b11, 0b1100, 0b110000, 0b11000000};
+    src1_t curr_val;
+    int32_t a_pos = 0;
+    // j selects one of the lane's two groups of 4 A elements; group j owns a_frag slots
+    // 2 * j and 2 * j + 1 and the matching 2-bit fields of idx.
+    // NOTE(review): assumes at most 2 non-zero values per group (4:2 sparsity); a third non-zero
+    // would advance a_pos beyond the group's slots - confirm the input is always pre-sparsified.
+    for(int j = 0; j < 2; ++j)
+    {
+        a_pos = j * 2;
+        for(int i = 0; i < 4; ++i)
+        {
+            curr_val = a_shared[(lane % M) * K + (lane / M) * 8 + 4 * j + i];
+            if(curr_val != 0.0f)
+            {
+                idx &= ~bit_clear_masks[a_pos];  // clear the 2-bit field for this slot
+                idx |= (i % 4) << 2 * a_pos;     // record the element's position inside its group
+                a_frag[a_pos] = curr_val;
+                a_pos++;
+            }
+        }
+    }
+    for(int i = 0; i < 8; ++i)
+    {
+        b_frag[i] = b_shared[(8 * (lane / N) + i) * N + (lane % N)];
+    }
+    builtin_smfmac_naive_selector<src1_vec, src2_vec, acc_vec>(a_frag, b_frag, idx, c_thread_buf_);
+    __syncthreads();
+    // store results from unpacked c_thread_buf_ output
+    // The two branches use different lane-to-element mappings; K == 32 is presumably the
+    // 16x16x32 shape and the other branch the 32x32x16 shape - confirm against the instantiations.
+    if constexpr(K == 32)
+    {
+        static_for<0, acc_vec_size, 1>{}([&](auto i) {
+            c[(4 * (lane / 16) + i) * N + lane % 16] =
+                ck::type_convert<dst_t>(c_thread_buf_[Number<i>{}]);
+        });
+    }
+    else
+    {
+        static_for<0, acc_vec_size, 1>{}([&](auto i) {
+            c[((8 * (i / 4)) % 32 + 4 * (lane / 32) + i % 4) * N + lane % 32] =
+                ck::type_convert<dst_t>(c_thread_buf_[Number<i>{}]);
+        });
+    }
+}
+// Problem sizes, strides and scaling factors of one GEMM test case.
+// A default-constructed instance describes a 16 x 16 x 32 problem with alpha = 1 and beta = 0.
+struct GemmParams
+{
+    ck::index_t M       = 16;
+    ck::index_t N       = 16;
+    ck::index_t K       = 32;
+    ck::index_t StrideA = 32;
+    ck::index_t StrideB = 16;
+    ck::index_t StrideC = 16;
+    float alpha         = 1;
+    float beta          = 0;
+};
+// Computes the reference result on the host: builds a GemmInstance, creates its invoker and
+// argument from the given tensors and element-wise operations, and runs it. The result is
+// written into C.
+template <typename GemmInstance,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+void RunHostGEMM(const Tensor<ADataType>& A,
+                 const Tensor<BDataType>& B,
+                 Tensor<CDataType>& C,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CElementwiseOperation c_element_op)
+{
+    GemmInstance reference_gemm{};
+    auto invoker = reference_gemm.MakeInvoker();
+    auto argument =
+        reference_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);
+    invoker.Run(argument);
+}
+// Runs the given kernel on the device: allocates device buffers sized from the tensor
+// descriptors, uploads A and B, launches the kernel with 1 block of 64 threads and copies the
+// output buffer back into C. Always returns true.
+template <typename KernelType, typename ADataType, typename BDataType, typename CDataType>
+bool RunDeviceGEMM(KernelType kernel,
+                   const Tensor<ADataType>& A,
+                   const Tensor<BDataType>& B,
+                   Tensor<CDataType>& C)
+{
+    DeviceMem a_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpaceSize());
+    // Only the inputs are uploaded; the output buffer is written by the kernel.
+    a_device_buf.ToDevice(A.mData.data());
+    b_device_buf.ToDevice(B.mData.data());
+    const auto* a_ptr = static_cast<const ADataType*>(a_device_buf.GetDeviceBuffer());
+    const auto* b_ptr = static_cast<const BDataType*>(b_device_buf.GetDeviceBuffer());
+    auto* c_ptr       = static_cast<CDataType*>(c_device_buf.GetDeviceBuffer());
+    kernel<<<1, 64>>>(a_ptr, b_ptr, c_ptr);
+    c_device_buf.FromDevice(C.mData.data());
+    return true;
+}
+template <typename DeviceSmfmac,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename GPUAccDataType,
+          typename CPUAccDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          index_t CAccNum,
+          index_t M,
+          index_t N,
+          index_t K>
+struct TestSmfmac
+{
+    // Creates the host tensors for one test case: A (M x K) and B (K x N) filled by
+    // GeneratorTensor_2 with bounds {-5, 5}, A then transformed into structural sparsity, plus
+    // two M x N result tensors (host reference and device output).
+    // Returns them as a tuple in the order (A, B, C host, C device).
+    auto PrepareGemmTensor(const ck::smfmac_op_util::GemmParams& params)
+    {
+        // Builds a 2D descriptor; the stride goes on the row index for RowMajor and on the
+        // column index for any other layout tag.
+        auto make_descriptor =
+            [](std::size_t rows, std::size_t cols, std::size_t ld, auto layout_tag) {
+                using Dims = std::vector<std::size_t>;
+                const bool row_major =
+                    std::is_same<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>::value;
+                return HostTensorDescriptor(Dims({rows, cols}),
+                                            row_major ? Dims({ld, 1}) : Dims({1, ld}));
+            };
+        Tensor<ADataType> a_m_k(make_descriptor(params.M, params.K, params.StrideA, ALayout{}));
+        Tensor<BDataType> b_k_n(make_descriptor(params.K, params.N, params.StrideB, BLayout{}));
+        Tensor<CDataType> c_host(make_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+        Tensor<CDataType> c_device(make_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+        // Fill A first, then B, so the generator is consumed in the same order as before.
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        // Only A is made sparse; B stays dense.
+        ck::utils::TransformIntoStructuralSparsity<ADataType>{}(a_m_k);
+        return std::make_tuple(a_m_k, b_k_n, c_host, c_device);
+    }
+    /// Runs one test case: prints the layout names, computes the reference result with
+    /// RunHostGEMM, launches @p smfmac_kernel through RunDeviceGEMM and compares the outputs.
+    ///
+    /// @param smfmac_kernel Kernel handed to ck::smfmac_op_util::RunDeviceGEMM.
+    /// @return true when RunDeviceGEMM reports the run as unsupported; otherwise the result
+    ///         of ck::utils::check_err when CDataType is float, or false for any other
+    ///         CDataType.
+    auto operator()(const DeviceSmfmac& smfmac_kernel)
+    {
+        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
+                  << ", CLayout = " << CLayout{}.name << std::endl;
+
+        // Arrange
+        ck::smfmac_op_util::GemmParams gemm_params;
+        gemm_params.M       = M;
+        gemm_params.N       = N;
+        gemm_params.K       = K;
+        gemm_params.StrideA = K; // A is M x K
+        gemm_params.StrideB = N; // B is K x N
+        gemm_params.StrideC = N; // C is M x N
+
+        auto tensors                        = PrepareGemmTensor(gemm_params);
+        const Tensor<ADataType>& a_tensor   = std::get<0>(tensors);
+        const Tensor<BDataType>& b_tensor   = std::get<1>(tensors);
+        Tensor<CDataType>& c_reference      = std::get<2>(tensors);
+        Tensor<CDataType>& c_device_output  = std::get<3>(tensors);
+
+        AElementwiseOperation a_op{};
+        BElementwiseOperation b_op{};
+        CElementwiseOperation c_op{};
+
+        using HostGemm = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                   BDataType,
+                                                                   CDataType,
+                                                                   CPUAccDataType,
+                                                                   AElementwiseOperation,
+                                                                   BElementwiseOperation,
+                                                                   CElementwiseOperation>;
+        ck::smfmac_op_util::RunHostGEMM<HostGemm>(
+            a_tensor, b_tensor, c_reference, a_op, b_op, c_op);
+
+        // Act
+        const bool is_supported =
+            ck::smfmac_op_util::RunDeviceGEMM(smfmac_kernel, a_tensor, b_tensor, c_device_output);
+        if(!is_supported)
+        {
+            // An unsupported run is reported as a pass so it does not fail the suite.
+            return true;
+        }
+
+        // Assert
+        if(!std::is_same<CDataType, float>::value)
+        {
+            std::cout << "UNSUPPORTED CDataType" << std::endl;
+            return false;
+        }
+        const bool outputs_match = ck::utils::check_err(c_device_output.mData, c_reference.mData);
+        std::cout << (outputs_match ? "SUCCESS" : "FAILURE") << std::endl;
+        return outputs_match;
+    }
+};
+} // namespace smfmac_op_util
+} // namespace ck
--- a/test/smfmac_op/smfmac_op_xdl.cpp
+++ b/test/smfmac_op/smfmac_op_xdl.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <tuple>
+#include <vector>
+#include "ck/ck.hpp"
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "test/smfmac_op/smfmac_op_util.hpp"
+using BF16        = ck::bhalf_t; // source type of the BF16 entries in KernelTypes
+using F16         = ck::half_t; // source type of the F16 entries in KernelTypes
+using F32         = float; // destination and accumulator type in every KernelTypes entry
+using Row         = ck::tensor_layout::gemm::RowMajor; // layout tag passed for A, B and C
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // element-wise op passed for A, B and C
+/// Typed gtest fixture that unpacks one configuration tuple into the template arguments of
+/// ck::smfmac_op_util::matmul and ck::smfmac_op_util::TestSmfmac.
+///
+/// Tuple slots: 0 Src1Type, 1 Src1VecSize, 2 Src2Type, 3 Src2VecSize, 4 DstType,
+/// 5 AccVecSize, 6 GPUAccType, 7 CPUAccType, 8 M, 9 N, 10 K. Integer slots are types whose
+/// default-constructed instance exposes the number through `.value`.
+template <typename Tuple>
+class TestSmfmac : public ::testing::Test
+{
+    protected:
+    /// Element type stored at position Idx of the configuration tuple.
+    template <std::size_t Idx>
+    using Slot = std::tuple_element_t<Idx, Tuple>;
+
+    using Src1Type                           = Slot<0>;
+    static constexpr ck::index_t Src1VecSize = Slot<1>{}.value;
+    using Src2Type                           = Slot<2>;
+    static constexpr ck::index_t Src2VecSize = Slot<3>{}.value;
+    using DstType                            = Slot<4>;
+    static constexpr ck::index_t AccVecSize  = Slot<5>{}.value;
+    using GPUAccType                         = Slot<6>;
+    using CPUAccType                         = Slot<7>;
+    static constexpr ck::index_t M           = Slot<8>{}.value;
+    static constexpr ck::index_t N           = Slot<9>{}.value;
+    static constexpr ck::index_t K           = Slot<10>{}.value;
+
+    /// Runs every kernel in the (currently single-entry) kernel tuple through
+    /// ck::smfmac_op_util::TestSmfmac and expects all of them to report success.
+    void Run()
+    {
+        constexpr auto default_kernel = ck::smfmac_op_util::matmul<Src1Type,
+                                                                   Src1VecSize,
+                                                                   Src2Type,
+                                                                   Src2VecSize,
+                                                                   GPUAccType,
+                                                                   AccVecSize,
+                                                                   DstType,
+                                                                   M,
+                                                                   N,
+                                                                   K>;
+        constexpr auto kernels = std::make_tuple(default_kernel);
+        using KernelTuple      = decltype(kernels);
+
+        bool all_passed = true;
+        ck::static_for<0, std::tuple_size_v<KernelTuple>, 1>{}([&](auto idx) {
+            // A, B and C are all row-major with pass-through element-wise operations.
+            using Runner =
+                ck::smfmac_op_util::TestSmfmac<std::tuple_element_t<idx.value, KernelTuple>,
+                                               Src1Type,
+                                               Src2Type,
+                                               DstType,
+                                               GPUAccType,
+                                               CPUAccType,
+                                               Row,
+                                               Row,
+                                               Row,
+                                               PassThrough,
+                                               PassThrough,
+                                               PassThrough,
+                                               AccVecSize,
+                                               M,
+                                               N,
+                                               K>;
+            all_passed &= Runner{}(std::get<idx.value>(kernels));
+        });
+        EXPECT_TRUE(all_passed);
+    }
+};
+/// Shorthand for ck::Number, used to keep the integer entries of the type list compact.
+template <ck::index_t Value>
+using I = ck::Number<Value>;
+/// One test configuration in the slot order the TestSmfmac fixture reads it:
+/// <Src1Type, Src1VecSize, Src2Type, Src2VecSize, DstType, AccVecSize, GPUAccType,
+///  CPUAccType, M, N, K>. Every listed case uses the same type for both sources, source
+/// vector sizes 4 and 8, F32 for destination and both accumulators, and M == N.
+template <typename SrcType, ck::index_t AccVec, ck::index_t MN, ck::index_t KDim>
+using SmfmacCase =
+    std::tuple<SrcType, I<4>, SrcType, I<8>, F32, I<AccVec>, F32, F32, I<MN>, I<MN>, I<KDim>>;
+
+/// Configurations the typed test suite is instantiated with.
+using KernelTypes = ::testing::Types<SmfmacCase<F16, 4, 16, 32>,
+                                     SmfmacCase<BF16, 4, 16, 32>,
+                                     SmfmacCase<F16, 16, 32, 16>,
+                                     SmfmacCase<BF16, 16, 32, 16>>;
+TYPED_TEST_SUITE(TestSmfmac, KernelTypes); // instantiate the fixture once per KernelTypes entry
+TYPED_TEST(TestSmfmac, TestSmfmacFP16BF16) { this->Run(); } // Run() lives in the templated fixture, hence this->
--- a/test/wmma_op/wmma_op_util.hpp
+++ b/test/wmma_op/wmma_op_util.hpp
@@ -11,6 +11,7 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/utility/amd_wmma.hpp"
+#include "ck/host_utility/device_prop.hpp"
 namespace ck {
 namespace wmma_op_util {
@@ -140,10 +141,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
        p_shared[8 * 16 * lane_hi + 8 * lane_lo + ele + 16 * 16] = b_temp[ele];
    }
+#ifdef __gfx12__
+    asm volatile("\
+    s_wait_dscnt 0x0 \n \
+    s_barrier_signal -1 \n \
+    s_barrier_wait -1 \
+    " ::);
+#else
    asm volatile("\
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
+#endif
    for(int ele = 0; ele < 16; ++ele)
    {
@@ -155,10 +164,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
        a_frag[ele] = p_shared[(ele / 8) * 16 * 8 + 8 * lane + ele % 8];
    }
+#ifdef __gfx12__
+    asm volatile("\
+    s_wait_dscnt 0x0 \n \
+    s_barrier_signal -1 \n \
+    s_barrier_wait -1 \
+    " ::);
+#else
    asm volatile("\
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
+#endif
    // sync threads, similar to mma_sync
    // __syncthreads();
@@ -357,7 +374,8 @@ struct TestWmma
            a, b, c_host, a_element_op, b_element_op, c_element_op);
        // Act
-        bool is_supported = ck::wmma_op_util::RunDeviceGEMM(wmma_kernel, a, b, c_device);
+        bool is_supported = ck::is_gfx11_supported() &&
+                            ck::wmma_op_util::RunDeviceGEMM(wmma_kernel, a, b, c_device);
        if(is_supported)
        {