Unverified Commit e553bcf1 authored by Harisankar Sadasivan's avatar Harisankar Sadasivan Committed by GitHub
Browse files

Merge branch 'develop' into universal_streamk

parents 31e104ca eaa870a1
...@@ -886,10 +886,10 @@ pipeline { ...@@ -886,10 +886,10 @@ pipeline {
} }
agent{ label rocmnode("gfx90a") } agent{ label rocmnode("gfx90a") }
environment{ environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1100;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-DGPU_TARGETS="gfx908;gfx90a" \ -DGPU_TARGETS="gfx1100;gfx90a" \
-DCMAKE_CXX_COMPILER="${build_compiler()}" \ -DCMAKE_CXX_COMPILER="${build_compiler()}" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
} }
......
...@@ -7,19 +7,23 @@ ...@@ -7,19 +7,23 @@
#include <initializer_list> #include <initializer_list>
#include <vector> #include <vector>
#include "ck/utility/common_header.hpp"
// __gfx9__ defined in the above header via ck.hpp
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/library/utility/fill.hpp" #include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/wrapper/layout.hpp" #include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp" #include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp" #include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp" #include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp" #include "ck/wrapper/utils/kernel_utils.hpp"
#include "ck/host_utility/device_prop.hpp"
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -204,6 +208,14 @@ void PerformGemm(const ck::index_t M, ...@@ -204,6 +208,14 @@ void PerformGemm(const ck::index_t M,
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool is_supported = ck::is_xdl_supported();
if(!is_supported)
{
std::cout << "WARNING: xdl example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
using DataType = ck::half_t; using DataType = ck::half_t;
const auto thread_layout = const auto thread_layout =
ck::wrapper::make_layout(ck::make_tuple(ck::Number<64>{}, ck::Number<4>{}), ck::wrapper::make_layout(ck::make_tuple(ck::Number<64>{}, ck::Number<4>{}),
...@@ -213,3 +225,4 @@ int main(int argc, char* argv[]) ...@@ -213,3 +225,4 @@ int main(int argc, char* argv[])
3840, 4096, 4096, tile_shape, thread_layout); 3840, 4096, 4096, tile_shape, thread_layout);
return 0; return 0;
} }
#endif
...@@ -7,18 +7,21 @@ ...@@ -7,18 +7,21 @@
#include <initializer_list> #include <initializer_list>
#include <vector> #include <vector>
#include "ck/library/utility/host_tensor.hpp" #include "ck/utility/common_header.hpp"
// __gfx9__ defined in the above header via ck.hpp
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
#include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/library/utility/fill.hpp" #include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/wrapper/layout.hpp" #include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp" #include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp" #include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp" #include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp" #include "ck/wrapper/utils/kernel_utils.hpp"
#include "ck/host_utility/device_prop.hpp"
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -296,6 +299,14 @@ void PerformGemm(const ck::index_t M, ...@@ -296,6 +299,14 @@ void PerformGemm(const ck::index_t M,
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool is_supported = ck::is_xdl_supported();
if(!is_supported)
{
std::cout << "WARNING: xdl example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
using DataType = ck::half_t; using DataType = ck::half_t;
const auto thread_layout = const auto thread_layout =
ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}), ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
...@@ -305,3 +316,4 @@ int main(int argc, char* argv[]) ...@@ -305,3 +316,4 @@ int main(int argc, char* argv[])
3840, 4096, 4096, tile_shape, thread_layout); 3840, 4096, 4096, tile_shape, thread_layout);
return 0; return 0;
} }
#endif
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/host_utility/device_prop.hpp"
struct AlphaBetaAdd struct AlphaBetaAdd
{ {
...@@ -175,6 +176,14 @@ int main(int argc, char* argv[]) ...@@ -175,6 +176,14 @@ int main(int argc, char* argv[])
exit(0); exit(0);
} }
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals; using namespace ck::literals;
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/host_utility/device_prop.hpp"
struct AlphaBetaAdd struct AlphaBetaAdd
{ {
...@@ -175,6 +176,14 @@ int main(int argc, char* argv[]) ...@@ -175,6 +176,14 @@ int main(int argc, char* argv[])
exit(0); exit(0);
} }
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals; using namespace ck::literals;
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common_wmma.hpp" #include "common_wmma.hpp"
#include "ck/host_utility/device_prop.hpp"
// kernel data types // kernel data types
using InKernelDataType = FP16; using InKernelDataType = FP16;
...@@ -23,4 +24,14 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; ...@@ -23,4 +24,14 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc" #include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } int main(int argc, char* argv[])
{
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv);
}
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_wmma.hpp" #include "common_wmma.hpp"
#include "ck/host_utility/device_prop.hpp"
// kernel data types // kernel data types
using InKernelDataType = I8; using InKernelDataType = I8;
...@@ -23,4 +24,14 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; ...@@ -23,4 +24,14 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
#include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc" #include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } int main(int argc, char* argv[])
{
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv);
}
...@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g ...@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/host_utility/device_prop.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -163,4 +164,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm< ...@@ -163,4 +164,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
#include "run_batched_gemm_scale_softmax_gemm_permute_wmma.inc" #include "run_batched_gemm_scale_softmax_gemm_permute_wmma.inc"
int main(int argc, char* argv[]) { return run(argc, argv); } int main(int argc, char* argv[])
{
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
return run(argc, argv);
}
...@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g ...@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/host_utility/device_prop.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -285,4 +286,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm< ...@@ -285,4 +286,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
#include "run_batched_gemm_scale_softmax_gemm_permute_wmma.inc" #include "run_batched_gemm_scale_softmax_gemm_permute_wmma.inc"
int main(int argc, char* argv[]) { return run(argc, argv); } int main(int argc, char* argv[])
{
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
return run(argc, argv);
}
...@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g ...@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/host_utility/device_prop.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -351,4 +352,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm< ...@@ -351,4 +352,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
#include "run_cross_attention_wmma.inc" #include "run_cross_attention_wmma.inc"
int main(int argc, char* argv[]) { return run(argc, argv); } int main(int argc, char* argv[])
{
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
return run(argc, argv);
}
...@@ -28,6 +28,7 @@ Example is GQA-4 ...@@ -28,6 +28,7 @@ Example is GQA-4
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/host_utility/device_prop.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -299,4 +300,14 @@ using ReferenceGemm1Instance = ...@@ -299,4 +300,14 @@ using ReferenceGemm1Instance =
#include "run_grouped_query_attention_forward_wmma.inc" #include "run_grouped_query_attention_forward_wmma.inc"
int main(int argc, char* argv[]) { return run(argc, argv); } int main(int argc, char* argv[])
{
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
return run(argc, argv);
}
...@@ -26,6 +26,7 @@ Shazeer, Noam. “Fast Transformer Decoding: One Write-Head Is All You Need.” ...@@ -26,6 +26,7 @@ Shazeer, Noam. “Fast Transformer Decoding: One Write-Head Is All You Need.”
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/host_utility/device_prop.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -284,4 +285,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm_ ...@@ -284,4 +285,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm_
#include "run_multi_query_attention_forward_wmma.inc" #include "run_multi_query_attention_forward_wmma.inc"
int main(int argc, char* argv[]) { return run(argc, argv); } int main(int argc, char* argv[])
{
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
return run(argc, argv);
}
...@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g ...@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/host_utility/device_prop.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -329,4 +330,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm< ...@@ -329,4 +330,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
#include "run_self_attention_wmma.inc" #include "run_self_attention_wmma.inc"
int main(int argc, char* argv[]) { return run(argc, argv); } int main(int argc, char* argv[])
{
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
return run(argc, argv);
}
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp"
#include "common.hpp" #include "common.hpp"
#include "ck/host_utility/device_prop.hpp"
using OutDataType = FP16; using OutDataType = FP16;
using WeiDataType = FP16; using WeiDataType = FP16;
...@@ -31,4 +32,14 @@ using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDat ...@@ -31,4 +32,14 @@ using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDat
#include "run_grouped_conv_bwd_data_example.inc" #include "run_grouped_conv_bwd_data_example.inc"
int main(int argc, char* argv[]) { return run_grouped_conv_bwd_data_example(argc, argv); } int main(int argc, char* argv[])
{
bool is_supported = ck::is_gfx11_supported();
if(!is_supported)
{
std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
return run_grouped_conv_bwd_data_example(argc, argv);
}
...@@ -47,12 +47,12 @@ __global__ void ...@@ -47,12 +47,12 @@ __global__ void
#endif #endif
kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3( kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3(
typename GridwiseGemm::Argument karg, typename GridwiseGemm::Argument karg,
const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, [[maybe_unused]] const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, [[maybe_unused]] const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock, c_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const index_t num_k_per_block) [[maybe_unused]] const index_t num_k_per_block)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx94__)) defined(__gfx94__))
...@@ -103,12 +103,12 @@ __global__ void ...@@ -103,12 +103,12 @@ __global__ void
#endif #endif
kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds( kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds(
typename GridwiseGemm::Argument karg, typename GridwiseGemm::Argument karg,
const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, [[maybe_unused]] const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, [[maybe_unused]] const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock, c_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const index_t num_k_per_block) [[maybe_unused]] const index_t num_k_per_block)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
......
...@@ -69,14 +69,15 @@ __global__ void ...@@ -69,14 +69,15 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif #endif
kernel_grouped_conv_fwd_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg, kernel_grouped_conv_fwd_xdl_cshuffle_v3(
const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, typename GridwiseGemm::Argument karg,
const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, [[maybe_unused]] const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock [[maybe_unused]] const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
[[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock, c_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffset compute_ptr_offset_of_groups, [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_groups,
const ComputePtrOffset compute_ptr_offset_of_n, [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n,
const index_t groups_count) [[maybe_unused]] const index_t groups_count)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
// offset base pointer for each work-group // offset base pointer for each work-group
...@@ -132,13 +133,13 @@ __global__ void ...@@ -132,13 +133,13 @@ __global__ void
#endif #endif
kernel_grouped_conv_fwd_xdl_cshuffle_v3_2lds( kernel_grouped_conv_fwd_xdl_cshuffle_v3_2lds(
typename GridwiseGemm::Argument karg, typename GridwiseGemm::Argument karg,
const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, [[maybe_unused]] const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, [[maybe_unused]] const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock, c_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffset compute_ptr_offset_of_groups, [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_groups,
const ComputePtrOffset compute_ptr_offset_of_n, [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n,
const index_t groups_count) [[maybe_unused]] const index_t groups_count)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
// offset base pointer for each work-group // offset base pointer for each work-group
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/utility/math.hpp"
#include "ck/utility/amd_smfmac.hpp"
namespace ck {
/// Identifiers for the supported sparse-MFMA (smfmac) instruction variants:
/// f16 and bf16 inputs, at 16x16 and 32x32 output tile sizes.
enum struct SmfmacInstr
{
    smfmac_f32_16x16x32f16  = 0,
    smfmac_f32_32x32x16f16  = 1,
    smfmac_f32_16x16x32bf16 = 2,
    smfmac_f32_32x32x16bf16 = 3,
};
// Primary template for per-instruction smfmac traits; intentionally left
// undefined — only the explicit specializations below (one per SmfmacInstr)
// provide the trait constants and the run() wrapper.
template <SmfmacInstr instr>
struct smfmac_type;
template <>
struct smfmac<SmfmacInstr::smfmac_f32_16x16x32f16>
{
static constexpr index_t group_size = 4;
static constexpr index_t num_groups_per_blk = 1;
static constexpr index_t num_regs_per_blk = 4;
static constexpr index_t num_threads_per_blk = 16;
static constexpr index_t wave_size = 64;
static constexpr index_t num_input_blks = 4;
static constexpr index_t num_output_blks = 1;
static constexpr index_t m_per_blk = 16;
static constexpr index_t n_per_blk = 16;
static constexpr index_t k_per_blk = 8;
static constexpr bool is_k_reduction = true;
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, const int32_t& idx, FloatC& reg_c) const
{
intrin_smfmac_f32_16x16x32f16<MPerXdlops, NPerXdlops>::Run(a, b, idx, reg_c);
}
};
template <>
struct smfmac<SmfmacInstr::smfmac_f32_32x32x16f16>
{
static constexpr index_t group_size = 4;
static constexpr index_t num_groups_per_blk = 4;
static constexpr index_t num_regs_per_blk = 16;
static constexpr index_t num_threads_per_blk = 32;
static constexpr index_t wave_size = 64;
static constexpr index_t num_input_blks = 2;
static constexpr index_t num_output_blks = 1;
static constexpr index_t m_per_blk = 32;
static constexpr index_t n_per_blk = 32;
static constexpr index_t k_per_blk = 16;
static constexpr bool is_k_reduction = true;
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, const int32_t& idx, FloatC& reg_c) const
{
intrin_smfmac_f32_32x32x16f16<MPerXdlops, NPerXdlops>::Run(a, b, idx, reg_c);
}
};
template <>
struct smfmac<SmfmacInstr::smfmac_f32_16x16x32bf16>
{
static constexpr index_t group_size = 4;
static constexpr index_t num_groups_per_blk = 1;
static constexpr index_t num_regs_per_blk = 4;
static constexpr index_t num_threads_per_blk = 16;
static constexpr index_t wave_size = 64;
static constexpr index_t num_input_blks = 4;
static constexpr index_t num_output_blks = 1;
static constexpr index_t m_per_blk = 16;
static constexpr index_t n_per_blk = 16;
static constexpr index_t k_per_blk = 8;
static constexpr bool is_k_reduction = true;
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, const int32_t& idx, FloatC& reg_c) const
{
intrin_smfmac_f32_16x16x32bf16<MPerXdlops, NPerXdlops>::Run(a, b, idx, reg_c);
}
};
template <>
struct smfmac<SmfmacInstr::smfmac_f32_32x32x16bf16>
{
static constexpr index_t group_size = 4;
static constexpr index_t num_groups_per_blk = 4;
static constexpr index_t num_regs_per_blk = 16;
static constexpr index_t num_threads_per_blk = 32;
static constexpr index_t wave_size = 64;
static constexpr index_t num_input_blks = 2;
static constexpr index_t num_output_blks = 1;
static constexpr index_t m_per_blk = 32;
static constexpr index_t n_per_blk = 32;
static constexpr index_t k_per_blk = 16;
static constexpr bool is_k_reduction = true;
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, const int32_t& idx, FloatC& reg_c) const
{
intrin_smfmac_f32_32x32x16bf16<MPerXdlops, NPerXdlops>::Run(a, b, idx, reg_c);
}
};
/// Maps an (element type, MPerXdlops, NPerXdlops) combination to the matching
/// sparse MFMA instruction and exposes its trait bundle as `selected_smfmac`.
/// The constructor cross-checks the selected instruction's traits at compile
/// time.
template <typename base_type,
          index_t MPerXdlops,
          index_t NPerXdlops,
          typename additional_type = base_type>
struct SmfmacSelector
{
    // Undefined primary: only the four (type, M, N) combinations specialized
    // below are supported; anything else fails at compile time.
    // NOTE(review): explicit specialization at class scope is non-standard
    // C++ — it compiles under hipcc/clang but verify portability if this
    // header is ever built with other compilers.
    template <typename base_type_,
              index_t MPerXdlops_,
              index_t NPerXdlops_,
              typename additional_type_ = base_type_>
    static constexpr auto GetSmfmac();

    // half_t, 16x16 tile
    template <>
    static constexpr auto GetSmfmac<half_t, 16, 16>()
    {
        return SmfmacInstr::smfmac_f32_16x16x32f16;
    }

    // half_t, 32x32 tile
    template <>
    static constexpr auto GetSmfmac<half_t, 32, 32>()
    {
        return SmfmacInstr::smfmac_f32_32x32x16f16;
    }

    // bhalf_t, 16x16 tile
    template <>
    static constexpr auto GetSmfmac<bhalf_t, 16, 16>()
    {
        return SmfmacInstr::smfmac_f32_16x16x32bf16;
    }

    // bhalf_t, 32x32 tile
    template <>
    static constexpr auto GetSmfmac<bhalf_t, 32, 32>()
    {
        return SmfmacInstr::smfmac_f32_32x32x16bf16;
    }

    // Trait bundle (smfmac_type specialization) of the selected instruction.
    static constexpr auto selected_smfmac =
        smfmac_type<GetSmfmac<base_type, MPerXdlops, NPerXdlops, additional_type>()>{};

    // Compile-time consistency checks between the selected instruction's
    // trait constants (register counts, block geometry, reduction mode).
    __host__ __device__ constexpr SmfmacSelector()
    {
        static_assert(selected_smfmac.group_size * selected_smfmac.num_groups_per_blk ==
                          selected_smfmac.num_regs_per_blk,
                      "wrong! num_regs_per_blk");

        static_assert(selected_smfmac.num_threads_per_blk == selected_smfmac.n_per_blk,
                      "n_per_blk != num_threads_per_blk");

        static_assert(selected_smfmac.num_regs_per_blk * selected_smfmac.num_input_blks ==
                          selected_smfmac.m_per_blk,
                      "m_per_blk != num_input_blks * num_regs_per_blk");

        static_assert(selected_smfmac.num_output_blks == selected_smfmac.num_input_blks ||
                          selected_smfmac.num_output_blks == 1,
                      "incorrect num_output_blks");

        static_assert(selected_smfmac.num_regs_per_blk * selected_smfmac.wave_size ==
                          selected_smfmac.m_per_blk * selected_smfmac.n_per_blk,
                      "num_regs_per_blk incorrect");

        static_assert(selected_smfmac.is_k_reduction ||
                          (selected_smfmac.num_input_blks == selected_smfmac.num_output_blks),
                      "is_k_reduction wrong!");
    }

    // K extent covered by one xdlops issue: with K-reduction the input blocks
    // each contribute k_per_blk along K.
    static constexpr index_t GetKPerXdlops()
    {
        return (selected_smfmac.is_k_reduction ? selected_smfmac.num_input_blks : 1) *
               selected_smfmac.k_per_blk;
    }

    // Innermost K chunk handled per issue.
    static constexpr index_t GetK1PerXdlops() { return selected_smfmac.k_per_blk; }
};
template <typename base_type,
index_t MPerXdlops,
index_t NPerXdlops,
index_t KPack,
typename additional_type = base_type>
struct SparseXdlopsGemm
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
using CIndex = MultiIndex<2>;
using CIndex4D = MultiIndex<4>;
__device__ static constexpr index_t GetNumBlks() { return smfmac_instr.num_output_blks; }
__device__ static constexpr index_t GetNumXdlops()
{
return MPerXdlops * NPerXdlops /
(smfmac_instr.m_per_blk * smfmac_instr.n_per_blk * smfmac_instr.num_output_blks);
}
__host__ __device__ constexpr SparseXdlopsGemm()
{
static_assert(NPerXdlops == 16 || NPerXdlops == 32,
"Only support GemmNPerXdlops == 16 or 32 for smfmac xdlops");
static_assert(MPerXdlops == 16 || MPerXdlops == 32,
"Only support GemmMPerXdlops == 16 or 32 for smfmac xdlops");
static_assert(KPack % smfmac_instr.k_per_blk == 0, "KPack cannot be divided by k_per_blk");
}
// XDL output supporting C = A * B
// M2_N2 -> M2_M3_M4_N2
template <typename CDesc_M0_N0_M1_N1_M2_N2>
__host__ __device__ static constexpr auto
MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2)
{
const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0);
const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1);
const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2);
const auto N1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3);
return transform_tensor_descriptor(
c_desc_m0_n0_m1_n1_m2_n2,
make_tuple(make_pass_through_transform(M0),
make_pass_through_transform(N0),
make_pass_through_transform(M1),
make_pass_through_transform(N1),
make_unmerge_transform(make_tuple(Number<smfmac_instr.num_groups_per_blk>{},
Number<smfmac_instr.num_input_blks>{},
Number<smfmac_instr.group_size>{})),
make_pass_through_transform(Number<smfmac_instr.num_threads_per_blk>{})),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3>{},
Sequence<4>{},
Sequence<5>{}),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3>{},
Sequence<4, 5, 6>{},
Sequence<7>{}));
}
// Batched (G-prefixed) variant of MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2:
// the leading G dimension passes through and M2 is unmerged into
// (num_groups_per_blk, num_input_blks, group_size), producing a 9D descriptor.
template <typename CDesc_G_M0_N0_M1_N1_M2_N2>
__host__ __device__ static constexpr auto MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(
    const CDesc_G_M0_N0_M1_N1_M2_N2& c_desc_g_m0_n0_m1_n1_m2_n2)
{
    const auto G  = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I0);
    const auto M0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I1);
    const auto N0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I2);
    const auto M1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I3);
    const auto N1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I4);
    return transform_tensor_descriptor(
        c_desc_g_m0_n0_m1_n1_m2_n2,
        make_tuple(make_pass_through_transform(G),
                   make_pass_through_transform(M0),
                   make_pass_through_transform(N0),
                   make_pass_through_transform(M1),
                   make_pass_through_transform(N1),
                   // NOTE(review): unlike the non-G overload these lengths are not
                   // wrapped in Number<>{}; presumably intentional — confirm.
                   make_unmerge_transform(make_tuple(smfmac_instr.num_groups_per_blk,
                                                     smfmac_instr.num_input_blks,
                                                     smfmac_instr.group_size)),
                   make_pass_through_transform(smfmac_instr.num_threads_per_blk)),
        // Lower (input) dimension ids of the 7D descriptor.
        make_tuple(Sequence<0>{},
                   Sequence<1>{},
                   Sequence<2>{},
                   Sequence<3>{},
                   Sequence<4>{},
                   Sequence<5>{},
                   Sequence<6>{}),
        // Upper (output) dimension ids: M2 -> (5, 6, 7), N2 -> 8.
        make_tuple(Sequence<0>{},
                   Sequence<1>{},
                   Sequence<2>{},
                   Sequence<3>{},
                   Sequence<4>{},
                   Sequence<5, 6, 7>{},
                   Sequence<8>{}));
}
/// Accumulator register count per lane for one MPerXdlops x NPerXdlops tile.
__device__ static constexpr index_t GetRegSizePerXdlops()
{
    constexpr index_t tile_elems = MPerXdlops * NPerXdlops;
    return tile_elems / smfmac_instr.wave_size;
}
/// Wavefront size required by the selected smfmac instruction.
__device__ static constexpr index_t GetWaveSize()
{
    return smfmac_instr.wave_size;
}
/// Issue the smfmac instruction once per k_per_blk slice of KPack.
/// @param p_a_wave   per-lane A fragments, indexed by k slice
/// @param p_b_wave   per-lane B fragments, indexed by k slice
/// @param idx        per-slice index registers (presumably the 2:4 sparsity
///                   selectors consumed by smfmac — confirm against the ISA)
/// @param p_c_thread accumulator registers, updated in place
template <class FloatA, class FloatB, class Idx, class FloatC>
__device__ void
Run(const FloatA& p_a_wave, const FloatB& p_b_wave, const Idx& idx, FloatC& p_c_thread) const
{
    // Fixed the duplicated word in the original message ("base base_type").
    static_assert(is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value,
                  "base_type must be half or bfloat16!");
    static_for<0, KPack / smfmac_instr.k_per_blk, 1>{}([&](auto k) {
        smfmac_instr.template run<MPerXdlops, NPerXdlops>(
            p_a_wave[k], p_b_wave[k], idx[k], p_c_thread);
    });
}
/// Lane index of the calling thread within its wavefront.
__device__ static auto GetLaneId()
{
    return get_thread_local_1d_id() % smfmac_instr.wave_size;
}
// Decomposes the lane id into (blk_id, blk_td): which input block within the
// wave this lane belongs to, and its thread position inside that block.
__device__ static auto GetBlkIdx()
{
    const auto laneId = GetLaneId();
    // Adaptor maps the flat lane id onto (1, num_input_blks, num_threads_per_blk).
    constexpr auto threadidx_to_blk_idx_adaptor = make_single_stage_tensor_adaptor(
        make_tuple(make_merge_transform(
            make_tuple(1, smfmac_instr.num_input_blks, smfmac_instr.num_threads_per_blk))),
        make_tuple(Sequence<0, 1, 2>{}),
        make_tuple(Sequence<0>{}));
    const auto blk_idx =
        threadidx_to_blk_idx_adaptor.CalculateBottomIndex(make_multi_index(laneId));
    // Index 0 of the bottom index corresponds to the size-1 dimension and is dropped.
    const auto blk_id = blk_idx[I1];
    const auto blk_td = blk_idx[I2];
    return make_tuple(blk_id, blk_td);
}
/// Starting (k-ish, lane) coordinate of this thread's A fragment.
/// Under k-reduction the origin is the lane's (blk_id, blk_td) decomposition;
/// otherwise every lane starts at row 0 at its own lane id.
/// The original computed both GetLaneId() and GetBlkIdx() unconditionally;
/// each constexpr branch only needs one of them, so the work is now branch-local.
__host__ __device__ static auto CalculateAThreadOriginDataIndex()
{
    if constexpr(smfmac_instr.is_k_reduction)
    {
        const auto blk_idx = GetBlkIdx();
        return make_tuple(blk_idx[I0], blk_idx[I1]);
    }
    else
    {
        return make_tuple(0, GetLaneId());
    }
}
/// Starting (k-ish, lane) coordinate of this thread's B fragment.
/// Identical logic to CalculateAThreadOriginDataIndex; kept as a separate
/// entry point to preserve the existing interface.
/// The original computed both GetLaneId() and GetBlkIdx() unconditionally;
/// each constexpr branch only needs one of them, so the work is now branch-local.
__host__ __device__ static auto CalculateBThreadOriginDataIndex()
{
    if constexpr(smfmac_instr.is_k_reduction)
    {
        const auto blk_idx = GetBlkIdx();
        return make_tuple(blk_idx[I0], blk_idx[I1]);
    }
    else
    {
        return make_tuple(0, GetLaneId());
    }
}
/// 2D (m, n) origin of the output sub-block owned by the calling thread
/// for the given xdlops iteration and output-block index.
__device__ static CIndex GetBeginOfThreadBlk(index_t xdlops_i, index_t blk_i)
{
    const auto decomposed = GetBlkIdx();
    const auto row_blk    = decomposed[I0];
    const auto col_lane   = decomposed[I1];

    const index_t m_offset =
        xdlops_i * smfmac_instr.m_per_blk + row_blk * smfmac_instr.group_size;
    const index_t n_offset = blk_i * smfmac_instr.n_per_blk + col_lane;

    return CIndex{m_offset, n_offset};
}
/// 4D per-thread origin within one output block; the xdlops iteration and
/// block indices do not affect this coordinate and are ignored.
__device__ static CIndex4D GetBeginOfThreadBlk4D(index_t /* xdlops_i */, index_t /* blk_i */)
{
    const auto decomposed = GetBlkIdx();
    return CIndex4D{I0, decomposed[I0], I0, decomposed[I1]};
}
// Instruction selector for the given base type and tile shape.
static constexpr auto smfmac =
    SmfmacSelector<base_type, MPerXdlops, NPerXdlops, additional_type>{};
// Descriptor of the selected sparse-MFMA instruction.
static constexpr auto smfmac_instr = smfmac.selected_smfmac;
static constexpr auto KPerXdlops   = smfmac.GetKPerXdlops();
static constexpr auto K1PerXdlops  = smfmac.GetK1PerXdlops();
static constexpr auto K0PerXdlops  = KPerXdlops / K1PerXdlops;
/// Per-thread C sub-block lengths along (M0, M1, M2, N).
__host__ __device__ static constexpr auto GetCM0M1M2NThreadBlkLengths()
{
    constexpr auto group_count = Number<smfmac_instr.num_groups_per_blk>{};
    constexpr auto group_size  = Number<smfmac_instr.group_size>{};
    return make_tuple(group_count, I1, group_size, I1);
}
};
} // namespace ck
...@@ -16,8 +16,15 @@ struct intrin_smfmac_f32_16x16x32f16<16, 16> ...@@ -16,8 +16,15 @@ struct intrin_smfmac_f32_16x16x32f16<16, 16>
__device__ static void __device__ static void
Run(const half4_t& reg_a, const half8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c) Run(const half4_t& reg_a, const half8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c)
{ {
#if defined(__gfx94__)
reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_16x16x32_f16( reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_16x16x32_f16(
reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], reg_idx, 0, 0); reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], reg_idx, 0, 0);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
ignore = reg_idx;
#endif
} }
}; };
...@@ -31,8 +38,15 @@ struct intrin_smfmac_f32_16x16x32bf16<16, 16> ...@@ -31,8 +38,15 @@ struct intrin_smfmac_f32_16x16x32bf16<16, 16>
__device__ static void __device__ static void
Run(const bhalf4_t& reg_a, const bhalf8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c) Run(const bhalf4_t& reg_a, const bhalf8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c)
{ {
#if defined(__gfx94__)
reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_16x16x32_bf16( reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(
reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], reg_idx, 0, 0); reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], reg_idx, 0, 0);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
ignore = reg_idx;
#endif
} }
}; };
...@@ -46,8 +60,15 @@ struct intrin_smfmac_f32_32x32x16f16<32, 32> ...@@ -46,8 +60,15 @@ struct intrin_smfmac_f32_32x32x16f16<32, 32>
__device__ static void __device__ static void
Run(const half4_t& reg_a, const half8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c) Run(const half4_t& reg_a, const half8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c)
{ {
#if defined(__gfx94__)
reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_32x32x16_f16( reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_32x32x16_f16(
reg_a, reg_b, reg_c.template AsType<float16_t>()[Number<0>{}], reg_idx, 0, 0); reg_a, reg_b, reg_c.template AsType<float16_t>()[Number<0>{}], reg_idx, 0, 0);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
ignore = reg_idx;
#endif
} }
}; };
...@@ -61,8 +82,15 @@ struct intrin_smfmac_f32_32x32x16bf16<32, 32> ...@@ -61,8 +82,15 @@ struct intrin_smfmac_f32_32x32x16bf16<32, 32>
__device__ static void __device__ static void
Run(const bhalf4_t& reg_a, const bhalf8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c) Run(const bhalf4_t& reg_a, const bhalf8_t& reg_b, const int32_t& reg_idx, FloatC& reg_c)
{ {
#if defined(__gfx94__)
reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_32x32x16_bf16( reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(
reg_a, reg_b, reg_c.template AsType<float16_t>()[Number<0>{}], reg_idx, 0, 0); reg_a, reg_b, reg_c.template AsType<float16_t>()[Number<0>{}], reg_idx, 0, 0);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
ignore = reg_idx;
#endif
} }
}; };
......
...@@ -71,6 +71,8 @@ function(add_test_executable TEST_NAME) ...@@ -71,6 +71,8 @@ function(add_test_executable TEST_NAME)
list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103) list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
elseif(ARGN MATCHES "_wmma") elseif(ARGN MATCHES "_wmma")
list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(ARGN MATCHES "_smfmac")
list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a)
endif() endif()
set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_executable(${TEST_NAME} ${ARGN}) add_executable(${TEST_NAME} ${ARGN})
...@@ -150,6 +152,8 @@ function(add_gtest_executable TEST_NAME) ...@@ -150,6 +152,8 @@ function(add_gtest_executable TEST_NAME)
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103) list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
elseif(ARGN MATCHES "_wmma") elseif(ARGN MATCHES "_wmma")
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(ARGN MATCHES "_smfmac")
list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a)
endif() endif()
set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_executable(${TEST_NAME} ${ARGN}) add_executable(${TEST_NAME} ${ARGN})
...@@ -209,7 +213,7 @@ add_subdirectory(wrapper) ...@@ -209,7 +213,7 @@ add_subdirectory(wrapper)
if(GPU_TARGETS MATCHES "gfx11") if(GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op) add_subdirectory(wmma_op)
endif() endif()
if(GPU_TARGETS MATCHES "gfx942") if(GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2) # smfmac needs ROCm6.2
add_subdirectory(smfmac_op) add_subdirectory(smfmac_op)
endif() endif()
add_subdirectory(position_embedding) add_subdirectory(position_embedding)
...@@ -2,11 +2,11 @@ add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_x ...@@ -2,11 +2,11 @@ add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_x
if(result EQUAL 0) if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
endif() endif()
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_xdl.cpp) add_gtest_executable(test_grouped_convnd_bwd_data_interface_xdl test_grouped_convnd_bwd_data_interface_xdl.cpp)
if(result EQUAL 0) if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance) target_link_libraries(test_grouped_convnd_bwd_data_interface_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance)
endif() endif()
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_wmma.cpp) add_gtest_executable(test_grouped_convnd_bwd_data_interface_wmma test_grouped_convnd_bwd_data_interface_wmma.cpp)
if(result EQUAL 0) if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance) target_link_libraries(test_grouped_convnd_bwd_data_interface_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance)
endif() endif()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment