enabled other types

7d700bc0 · Jing Zhang · 54793dfd · 7d700bc0 · 7d700bc0 · 7d700bc0
Commit 7d700bc0 authored Apr 03, 2024 by Jing Zhang
18 changed files
--- a/example/02_gemm_bilinear/CMakeLists.txt
+++ b/example/02_gemm_bilinear/CMakeLists.txt
-list(APPEND gpu_list1 gfx1100 gfx1101 gfx1102)
+list(APPEND gpu_list1 gfx1100 gfx1101 gfx1102 gfx1200)
 list(APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)

--- a/example/20_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
 list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
-list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
+list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1200)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)

--- a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
 add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp)
-if(GPU_TARGETS MATCHES "gfx11")
+if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
    add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp)
 endif()
--- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
+++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx11")
+if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
    add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp)
    add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp)
    add_example_executable(example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp)

--- a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+++ b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
 list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
-list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
+list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1200)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)

--- a/example/64_fpAintB_gemm/CMakeLists.txt
+++ b/example/64_fpAintB_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx11")
+if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
  add_custom_target(example_fpAintB_gemm_wmma)
  add_example_executable(example_fp16int8_gemm_wmma fp16int8_gemm_wmma.cpp)
  add_dependencies(example_fpAintB_gemm_wmma example_fp16int8_gemm_wmma)

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
@@ -137,8 +137,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
    static constexpr auto BEnableLds_auto = MWaves == 1 ? false : true;
    // If true, LDS is used unconditionally
-    static constexpr auto AEnableLds_manu = false;
+    static constexpr auto AEnableLds_manu = true;
-    static constexpr auto BEnableLds_manu = false;
+    static constexpr auto BEnableLds_manu = true;
    static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1);
    static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1);
@@ -829,7 +829,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp
@@ -56,7 +56,7 @@ __global__ void
                                                       bool input_permute,
                                                       bool output_permute)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
    // clang-format off
 // ***************************************************
@@ -159,6 +159,7 @@ __global__ void
    ignore = O;
    ignore = G0;
    ignore = G1;
+    ignore = alpha;
    ignore = input_permute;
    ignore = output_permute;
 #endif // end of if (defined(__gfx11__))
@@ -187,7 +188,7 @@ __global__ void
                                           index_t head_size,
                                           float alpha)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
    // clang-format off
 // ***************************************************
@@ -561,7 +562,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
    static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true;
    static constexpr auto B1EnableLds_auto = MWaves == 1 ? false : true;
-    static constexpr auto AEnableLds_manu  = false;
+    static constexpr auto AEnableLds_manu  = true;
    static constexpr auto B0EnableLds_manu = true;
    static constexpr auto B1EnableLds_manu = true;
@@ -858,7 +859,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
    static bool IsSupportedArgument(const RawArg& arg)
    {
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            if constexpr(!(is_same_v<Acc0DataType, float> || is_same_v<Acc0DataType, int32_t>))
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp
@@ -509,7 +509,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB<ALayout,
    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, ck::half_t> ||
                           is_same_v<AccDataType, int32_t>))

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
@@ -515,7 +515,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
@@ -629,7 +629,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
    static bool IsSupportedArgument(const Argument& arg)
    {
        // check device
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp
@@ -702,7 +702,7 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle
    static bool IsSupportedArgument(const Argument& arg)
    {
        // check device
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp
@@ -61,7 +61,7 @@ __global__ void
                                            bool input_permute,
                                            bool output_permute)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
    // clang-format off
 // ***************************************************
@@ -166,6 +166,7 @@ __global__ void
    ignore = O;
    ignore = G0;
    ignore = G1;
+    ignore = alpha;
    ignore = input_permute;
    ignore = output_permute;
 #endif // end of if (defined(__gfx11__))
@@ -299,7 +300,7 @@ struct DeviceGroupedQueryAttentionForward_Wmma
    static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true;
    static constexpr auto B1EnableLds_auto = MWaves == 1 ? false : true;
-    static constexpr auto AEnableLds_manu  = false;
+    static constexpr auto AEnableLds_manu  = true;
    static constexpr auto B0EnableLds_manu = true;
    static constexpr auto B1EnableLds_manu = true;
@@ -596,7 +597,7 @@ struct DeviceGroupedQueryAttentionForward_Wmma
    static bool IsSupportedArgument(const RawArg& arg)
    {
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            if constexpr(!(is_same_v<Acc0DataType, float> || is_same_v<Acc0DataType, int32_t>))
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp
@@ -60,7 +60,7 @@ __global__ void
                                          bool input_permute,
                                          bool output_permute)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx11__))
    // clang-format off
 // ***************************************************
@@ -165,6 +165,7 @@ __global__ void
    ignore = O;
    ignore = G0;
    ignore = G1;
+    ignore = alpha;
    ignore = input_permute;
    ignore = output_permute;
 #endif // end of if (defined(__gfx11__))
@@ -594,7 +595,7 @@ struct DeviceMultiQueryAttentionForward_Wmma
    static bool IsSupportedArgument(const RawArg& arg)
    {
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            if constexpr(!(is_same_v<Acc0DataType, float> || is_same_v<Acc0DataType, int32_t>))
            {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
@@ -333,7 +333,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
                // AK0_M_AK1 -> AK0_MRepeat_Mwaves_MPerWmma_AK1
                constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0);
                constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
-                constexpr auto A_KRow = I1;
+                // constexpr auto A_KRow = I1;
+                constexpr auto A_KRow = I2;
                return transform_tensor_descriptor(
                    ABlockDesc_{},
                    make_tuple(make_unmerge_transform(make_tuple(Number<A_K0>{}, A_KRow)),
@@ -373,7 +374,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
                // BK0_L_BK1 -> BK0_LRepeat_Lwaves_LPerWmma_BK1
                constexpr auto B_K0 = B0BlockDesc_{}.GetLength(I0);
                constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I2);
-                constexpr auto B_KRow = I1;
+                // constexpr auto B_KRow = I1;
+                constexpr auto B_KRow = I2;
                return transform_tensor_descriptor(
                    B0BlockDesc_{},
                    make_tuple(make_unmerge_transform(make_tuple(Number<B_K0>{}, B_KRow)),
@@ -410,7 +412,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
    {
        constexpr index_t A_L0 = A1BlockDesc_AL0_M_AL1{}.GetLength(I0);
        constexpr index_t A_L1 = A1BlockDesc_AL0_M_AL1{}.GetLength(I2);
-        constexpr auto A_LRow  = I1;
+        // constexpr auto A_LRow  = I1;
+        constexpr auto A_LRow = I2;
        return transform_tensor_descriptor(
            A1BlockDesc_AL0_M_AL1{},
            make_tuple(make_unmerge_transform(make_tuple(Number<A_L0>{}, A_LRow)),
@@ -430,7 +433,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
                // BL0_N_BL1 -> BL0_NRepeat_Nwaves_NPerWmma_BL1
                constexpr auto B_L0 = B1BlockDesc_{}.GetLength(I0);
                constexpr auto B_L1 = B1BlockDesc_{}.GetLength(I2);
-                constexpr auto B_LRow = I1;
+                // constexpr auto B_LRow = I1;
+                constexpr auto B_LRow = I2;
                return transform_tensor_descriptor(
                    B1BlockDesc_{},
                    make_tuple(make_unmerge_transform(make_tuple(Number<B_L0>{}, B_LRow)),
@@ -1179,7 +1183,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
                              MRepeat,
                              NRepeat,
                              KPack,
-                              false,
+                              true,
                              B1EnableLds,
                              true>{make_tuple(0, 0, 0, 0, 0, 0)};
@@ -1342,7 +1346,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
                        block_sync_lds();
-                        blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf);
+                        //blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf);
                        block_sync_lds();
@@ -1365,7 +1369,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
                    block_sync_lds();
-                    blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf);
+                    //blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf);
                }
            } // end gemm1

--- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
@@ -50,8 +50,7 @@ __global__ void
                                 const CElementwiseOperation c_element_op,
                                 const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
-    defined(__gfx1102__))
    __shared__ char p_shared[GridwiseGemm::SharedMemTrait::lds_size];
    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,

--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -20,6 +20,8 @@ enum struct WmmaInstr
    wmma_i32_16x16x16_iu4,
    // gfx12
    wmma_f32_16x16x16_f16_gfx12,
+    wmma_f32_16x16x16_bf16_gfx12,
+    wmma_i32_16x16x16_iu8_gfx12,
 };
 /*
@@ -121,46 +123,6 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16,
    }
 };
-// A-swizzled
-template <index_t WaveSize>
-struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
-                 WaveSize,
-                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
-{
-    // Absolute fixing property
-    // * Data Pixel
-    static constexpr index_t m_per_wmma = 16;
-    static constexpr index_t n_per_wmma = 16;
-    static constexpr index_t k_per_wmma = 16;
-    // static constexpr index_t src_a_data_size = 2;
-    // static constexpr index_t src_b_data_size = 2;
-    // static constexpr index_t acc_data_size   = 4;
-    // * Thread mapping inside wave, num_thread_per_subgroups always alone N direction
-    static constexpr index_t acc_data_size            = 4;
-    static constexpr index_t acc_pack_number          = 1;
-    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
-    // Wave mode dependent propety
-    static constexpr index_t wave_size = Number<WaveSize>{};
-    // * Fixed in Navi3x, Will be wave mode dependent on Navi4x
-    // static constexpr index_t num_src_a_vgprs_per_wave = k_per_wmma / 2 * src_a_data_size / 4;
-    // static constexpr index_t num_src_b_vgprs_per_wave = k_per_wmma / 2 * src_b_data_size / 4;
-    // * num_acc_vgprs_per_wave alone M direction
-    // * num_subgroups alone M direction
-    static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
-    static constexpr index_t num_subgroups          = wave_size / num_thread_per_subgroups;
-    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
-    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
-    {
-        static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
-        if constexpr(wave_size == 32)
-        {
-            intrin_wmma_f32_16x16x16_f16_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
-        }
-    }
-};
 template <index_t WaveSize>
 struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf16,
                 WaveSize,
@@ -322,6 +284,122 @@ struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8,
    }
 };
+// gfx12
+// A-swizzled
+template <index_t WaveSize>
+struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
+{
+    // Absolute fixing property
+    // * Data Pixel
+    static constexpr index_t m_per_wmma = 16;
+    static constexpr index_t n_per_wmma = 16;
+    static constexpr index_t k_per_wmma = 16;
+    // static constexpr index_t src_a_data_size = 2;
+    // static constexpr index_t src_b_data_size = 2;
+    // static constexpr index_t acc_data_size   = 4;
+    // * Thread mapping inside wave, num_thread_per_subgroups always alone N direction
+    static constexpr index_t acc_data_size            = 4;
+    static constexpr index_t acc_pack_number          = 1;
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+    // Wave mode dependent propety
+    static constexpr index_t wave_size = Number<WaveSize>{};
+    // * Fixed in Navi3x, Will be wave mode dependent on Navi4x
+    // static constexpr index_t num_src_a_vgprs_per_wave = k_per_wmma / 2 * src_a_data_size / 4;
+    // static constexpr index_t num_src_b_vgprs_per_wave = k_per_wmma / 2 * src_b_data_size / 4;
+    // * num_acc_vgprs_per_wave alone M direction
+    // * num_subgroups alone M direction
+    static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
+    static constexpr index_t num_subgroups          = wave_size / num_thread_per_subgroups;
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
+        if constexpr(wave_size == 32)
+        {
+            intrin_wmma_f32_16x16x16_f16_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+        }
+    }
+};
+template <index_t WaveSize>
+struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf16_gfx12,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
+{
+    // Absolute fixing property
+    static constexpr index_t m_per_wmma = 16;
+    static constexpr index_t n_per_wmma = 16;
+    static constexpr index_t k_per_wmma = 16;
+    // static constexpr index_t src_a_data_size          = 2;
+    // static constexpr index_t src_b_data_size          = 2;
+    static constexpr index_t acc_data_size            = 4;
+    static constexpr index_t acc_pack_number          = 1;
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+    // Wave mode dependent propety
+    static constexpr index_t wave_size = Number<WaveSize>{};
+    // static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
+    // static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
+    static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
+    static constexpr index_t num_subgroups          = wave_size / num_thread_per_subgroups;
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
+        if constexpr(wave_size == 32)
+        {
+            intrin_wmma_f32_16x16x16_bf16_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+        }
+    }
+};
+template <index_t WaveSize>
+struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8_gfx12,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
+{
+    // Absolute fixing property
+    static constexpr index_t m_per_wmma = 16;
+    static constexpr index_t n_per_wmma = 16;
+    static constexpr index_t k_per_wmma = 16;
+    // static constexpr index_t src_a_data_size          = 2;
+    // static constexpr index_t src_b_data_size          = 2;
+    static constexpr index_t acc_data_size            = 4;
+    static constexpr index_t acc_pack_number          = 1;
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+    // Wave mode dependent propety
+    static constexpr index_t wave_size = Number<WaveSize>{};
+    // static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
+    // static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
+    static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
+    static constexpr index_t num_subgroups          = wave_size / num_thread_per_subgroups;
+    template <index_t MPerWmma,
+              index_t NPerWmma,
+              class FloatA,
+              class FloatB,
+              class FloatC,
+              bool neg_a = false,
+              bool neg_b = false,
+              bool clamp = false>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
+        if constexpr(wave_size == 32)
+        {
+            intrin_wmma_i32_16x16x16_iu8_w32_gfx12<MPerWmma, NPerWmma, neg_a, neg_b, clamp>::Run(
+                a, b, reg_c);
+        }
+    }
+};
 template <typename src_type_a,
          typename src_type_b,
          typename dst_type,
@@ -349,7 +427,11 @@ struct WmmaSelector
    template <>
    static constexpr auto GetWmma<bhalf_t, bhalf_t, float, 16, 16>()
    {
+#ifdef __gfx12__
+        return WmmaInstr::wmma_f32_16x16x16_bf16_gfx12;
+#else
        return WmmaInstr::wmma_f32_16x16x16_bf16;
+#endif
    }
    template <>
@@ -367,8 +449,13 @@ struct WmmaSelector
    template <>
    static constexpr auto GetWmma<int8_t, int8_t, int, 16, 16>()
    {
+#ifdef __gfx12__
+        return WmmaInstr::wmma_i32_16x16x16_iu8_gfx12;
+#else
        return WmmaInstr::wmma_i32_16x16x16_iu8;
+#endif
    }
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
    template <>
    static constexpr auto GetWmma<int4_t, int4_t, int, 16, 16>()

--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -39,31 +39,6 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16>
    }
 };
-template <index_t MPerWave, index_t NPerWave>
-struct intrin_wmma_f32_16x16x16_f16_w32_gfx12;
-template <>
-struct intrin_wmma_f32_16x16x16_f16_w32_gfx12<16, 16>
-{
-    template <class FloatC>
-    __device__ static void Run(const half8_t& reg_a, const half8_t& reg_b, FloatC& reg_c)
-    {
-        // * Inline assembly need to elimate the duplicated data load, compiler won't help you
-        // delete them.
-        // amd_assembly_wmma_f32_16x16x16_f16_w32(
-        //     reg_a, reg_b, reg_c.template AsType<float8_t>()(Number<0>{}));
-#if defined(__gfx12__)
-        reg_c.template AsType<float8_t>()(Number<0>{}) =
-            __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(
-                reg_a, reg_b, reg_c.template AsType<float8_t>()[Number<0>{}]);
-#else
-        ignore = reg_a;
-        ignore = reg_b;
-        ignore = reg_c;
-#endif
-    }
-};
 // src: bf16, dst: fp32
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_wmma_f32_16x16x16_bf16_w32;
@@ -282,5 +257,87 @@ struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp>
    }
 };
+// gfx12
+/********************************WAVE32 MODE***********************************************/
+#if defined(__gfx1200__)
+#define __gfx12__
+#endif
+// src: fp16, dst: fp32
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_wmma_f32_16x16x16_f16_w32_gfx12;
+template <>
+struct intrin_wmma_f32_16x16x16_f16_w32_gfx12<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const half8_t& reg_a, const half8_t& reg_b, FloatC& reg_c)
+    {
+        // * Inline assembly need to elimate the duplicated data load, compiler won't help you
+        // delete them.
+        // amd_assembly_wmma_f32_16x16x16_f16_w32(
+        //     reg_a, reg_b, reg_c.template AsType<float8_t>()(Number<0>{}));
+#if defined(__gfx12__)
+        reg_c.template AsType<float8_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(
+                reg_a, reg_b, reg_c.template AsType<float8_t>()[Number<0>{}]);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
+// src: bf16, dst: fp32
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_wmma_f32_16x16x16_bf16_w32_gfx12;
+template <>
+struct intrin_wmma_f32_16x16x16_bf16_w32_gfx12<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const bhalf8_t& reg_a, const bhalf8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx12__)
+        reg_c.template AsType<float8_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(
+                reg_a, reg_b, reg_c.template AsType<float8_t>()[Number<0>{}]);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
+// src: iu8, dst: i32
+template <index_t MPerWave, index_t NPerWave, bool neg_a, bool neg_b, bool clamp>
+struct intrin_wmma_i32_16x16x16_iu8_w32_gfx12;
+template <bool neg_a, bool neg_b, bool clamp>
+struct intrin_wmma_i32_16x16x16_iu8_w32_gfx12<16, 16, neg_a, neg_b, clamp>
+{
+    template <class FloatC>
+    __device__ static void Run(const int8x8_t& reg_a, const int8x8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx12__)
+        reg_c.template AsType<int32x8_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
+                neg_a,
+                bit_cast<int32x2_t>(reg_a),
+                neg_b,
+                bit_cast<int32x2_t>(reg_b),
+                reg_c.template AsType<int32x8_t>()[Number<0>{}],
+                clamp);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
 } // namespace ck
 #endif