Tidy up + format

0a808724 · aska-0096 · 289f15de · 0a808724 · 0a808724 · 0a808724
Commit 0a808724 authored Dec 09, 2022 by aska-0096
4 changed files
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
@@ -38,8 +38,10 @@ __global__ void
            FloatC* __restrict__ p_c_grid,
            const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
            const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
-            const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock,
-            // const CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup
+            const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+            // const
+            // CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup
            //     c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup,
            const AElementwiseOperation a_element_op,
            const BElementwiseOperation b_element_op,
@@ -49,8 +51,7 @@ __global__ void
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

-    GridwiseGemm::template Run<HasMainKBlockLoop>(
-        p_a_grid,
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
                                                  p_b_grid,
                                                  p_c_grid,
                                                  p_shared,
@@ -75,8 +76,7 @@ __global__ void
 #endif // end of if (defined(__gfx1100__))
 }

-template <
-    index_t BlockSize,
+template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatCShuffle,
@@ -202,17 +202,19 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
    {
        // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
+        constexpr auto a_block_desc_k0perblock_mperblock_k1 =
+            GetABlockDescriptor_K0PerBlock_MPerBlock_K1();

-        constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
+        constexpr auto b_block_desc_k0perblock_nperblock_k1 =
+            GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();

        constexpr auto max_lds_align = K1;

-        constexpr auto a_block_space_size_aligned =
-            math::integer_least_multiple(a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align);
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align);

-        constexpr auto b_block_space_size_aligned =
-            math::integer_least_multiple(b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align);
+        constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
+            b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align);

        return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(FloatAB);
    }
@@ -308,7 +310,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
        constexpr auto WmmaK = 16;
        constexpr auto KPack = math::integer_least_multiple(K1, WmmaK);

-        using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3<BlockSize,
+        using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle<
+            BlockSize,
            FloatAB,
            FloatAcc,
            decltype(a_block_desc_k0perblock_mperblock_k1),
@@ -319,7 +322,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
            NRepeat,
            KPack>;

-        return BlockwiseGemm::MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_grid_desc_m_n);
+        return BlockwiseGemm::
+            MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+                c_grid_desc_m_n);
    }

    // Per pixel
@@ -362,7 +367,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
        constexpr auto WmmaK = 16;
        constexpr auto KPack = math::integer_least_multiple(K1, WmmaK);

-        using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3<BlockSize,
+        using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle<
+            BlockSize,
            FloatAB,
            FloatAcc,
            decltype(a_block_desc_k0perblock_mperblock_k1),
@@ -373,7 +379,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
            NRepeat,
            KPack>;

-        return BlockwiseGemm::MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(c_grid_desc_m_n);
+        return BlockwiseGemm::
+            MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(
+                c_grid_desc_m_n);
    }

    __host__ __device__ static constexpr auto
@@ -402,11 +410,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
            c_grid_desc_m_n);
    }
-    // using CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup = remove_cvref_t<decltype(
+    // using
+    // CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup
+    // = remove_cvref_t<decltype(
    //         MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(
    //             CGridDesc_M_N{}))>;
-    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = 
-        remove_cvref_t<decltype(MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>;
+    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>;
    using DefaultBlock2CTileMap =
        remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>;

@@ -420,14 +430,15 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
        const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
        const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
            c_grid_desc_mblock_mperblock_nblock_nperblock,
-        // const CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup&
+        // const
+        // CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup&
        // c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup,
        const AElementwiseOperation& a_element_op,
        const BElementwiseOperation& b_element_op,
        const CElementwiseOperation& c_element_op,
        const Block2CTileMap& block_2_ctile_map)
    {
-// clang-format off
+        // clang-format off
 /*******************************************************************************/
 // Memory buffer zone.
        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -453,12 +464,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
 /*******************************************************************************/
 // BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy
        const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
-        // printf("K0 = %d, M = %d, K1 = %d\n", K0, a_grid_desc_k0_m_k1.GetLength(I1), (a_grid_desc_k0_m_k1.GetLength(I2))());
        constexpr auto max_lds_align = K1;
        constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
        constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
-        // printf("blockdesc: K0 = %d, M = %d, K1 = %d\n", (a_block_desc_k0perblock_mperblock_k1.GetLength(I0))(), 
-                    // (a_block_desc_k0perblock_mperblock_k1.GetLength(I1))(), (a_block_desc_k0perblock_mperblock_k1.GetLength(I2))());
        // A matrix blockwise copy
        auto a_blockwise_copy =
            ThreadGroupTensorSliceTransfer_v4r1<        ThisThreadBlock,
@@ -532,7 +540,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
        constexpr auto KPack = math::integer_least_multiple(K1, WmmaK);

        auto blockwise_gemm =
-            BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3<BlockSize,
+            BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle<BlockSize,
                                                         FloatAB,
                                                         FloatAcc,
                                                         decltype(a_block_desc_k0perblock_mperblock_k1),
@@ -838,12 +846,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
                if constexpr(access_id < num_access - 1)
                {
                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
-                    // CONFIRMED
-                    // printf("c_global_step = (%d, %d, %d, %d)\n", 
-                                            // c_global_step[Number<0>{}], 
-                                            // c_global_step[Number<1>{}], 
-                                            // c_global_step[Number<2>{}], 
-                                            // c_global_step[Number<3>{}]);
                    // move on C
                    c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);

--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -12,11 +12,11 @@ namespace ck {
 enum struct WmmaInstr
 {
    wmma_f32_16x16x16_f16 = 0,
-    wmma_f32_16x16x16_bf16 = 0,
-    wmma_f16_16x16x16_f16 = 0,
-    wmma_bf16_16x16x16_bf16 = 0,
-    wmma_i32_16x16x16_iu8 = 0,
-    wmma_i32_16x16x16_iu4 = 0
+    wmma_f32_16x16x16_bf16,
+    wmma_f16_16x16x16_f16,
+    wmma_bf16_16x16x16_bf16,
+    wmma_i32_16x16x16_iu8,
+    wmma_i32_16x16x16_iu4
 };

 /*
@@ -70,18 +70,18 @@ enum struct WmmaInstr
 *	T  = Thread ID
 */

-template <WmmaInstr Instr, 
-          index_t WaveSize,
-          typename = void>
-struct wmma_type{};
+template <WmmaInstr Instr, index_t WaveSize, typename = void>
+struct wmma_type
+{
+};

 // A-swizzled
 template <index_t WaveSize>
 struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16,
                 WaveSize,
-                 typename std::enable_if_t<WaveSize == 32 ||WaveSize == 64>>
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
 {
-// Absolute fixing property
+    // Absolute fixing property
    // * Data Pixel
    static constexpr index_t m_per_wmma      = 16;
    static constexpr index_t n_per_wmma      = 16;
@@ -92,14 +92,15 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16,
    // * Thread mapping inside wave, num_thread_per_subgroups always alone N direction
    static constexpr index_t num_thread_per_subgroups = n_per_wmma;

-// Wave mode dependent propety
+    // Wave mode dependent propety
    static constexpr index_t wave_size = Number<WaveSize>{};
    // * Fixed in Navi3x, Will be wave mode dependent on Navi4x
    static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
    static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
    // * num_acc_vgprs_per_wave alone M direction
    // * num_subgroups alone M direction
-    static constexpr index_t num_acc_vgprs_per_wave   = m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4;
+    static constexpr index_t num_acc_vgprs_per_wave =
+        m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4;
    static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups;

    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
@@ -116,6 +117,172 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16,
    }
 };

+template <index_t WaveSize> // wmma_type specialization for WmmaInstr::wmma_f32_16x16x16_bf16
+struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf16,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>> // only WaveSize 32 or 64
+{
+    // Fixed properties: literal constants that do not depend on WaveSize
+    static constexpr index_t m_per_wmma               = 16;
+    static constexpr index_t n_per_wmma               = 16;
+    static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t src_a_data_size          = 2;
+    static constexpr index_t src_b_data_size          = 2;
+    static constexpr index_t acc_data_size            = 4;
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+
+    // Wave mode dependent properties (wave_size comes from the WaveSize template argument)
+    static constexpr index_t wave_size                = Number<WaveSize>{};
+    static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
+    static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
+    static constexpr index_t num_acc_vgprs_per_wave =
+        m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4;
+    static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups;
+
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    { // forwards (a, b, reg_c) to the intrinsic wrapper that matches wave_size
+        if constexpr(wave_size == 32)
+        {
+            intrin_wmma_f32_16x16x16_bf16_w32<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+        }
+        else if constexpr(wave_size == 64)
+        {
+            intrin_wmma_f32_16x16x16_bf16_w64<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+        }
+    }
+};
+
+#ifdef CK_UNPACKED_ACC_DESC_LOGIC
+template <index_t WaveSize> // wmma_type specialization for WmmaInstr::wmma_f16_16x16x16_f16
+struct wmma_type<WmmaInstr::wmma_f16_16x16x16_f16,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>> // only WaveSize 32 or 64
+{
+    // Fixed properties: literal constants that do not depend on WaveSize
+    static constexpr index_t m_per_wmma               = 16;
+    static constexpr index_t n_per_wmma               = 16;
+    static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t src_a_data_size          = 2;
+    static constexpr index_t src_b_data_size          = 2;
+    static constexpr index_t acc_data_size            = 2; // NOTE(review): WmmaSelector's accumulator static_assert (== m * n * 4) does not hold for 2 - confirm
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+
+    // Wave mode dependent properties (wave_size comes from the WaveSize template argument)
+    static constexpr index_t wave_size                = Number<WaveSize>{};
+    static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
+    static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
+    static constexpr index_t num_acc_vgprs_per_wave =
+        m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4;
+    static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups;
+
+    template <index_t MPerWmma,
+              index_t NPerWmma,
+              index_t Opsel, // NOTE(review): WmmaGemm::Run calls run<MPerWmma, NPerWmma> without Opsel - confirm call site
+              class FloatA,
+              class FloatB,
+              class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    { // forwards (a, b, reg_c) to the intrinsic wrapper that matches wave_size
+        if constexpr(wave_size == 32)
+        {
+            intrin_wmma_f16_16x16x16_f16_w32<MPerWmma, NPerWmma, Opsel>::Run(a, b, reg_c);
+        }
+        else if constexpr(wave_size == 64)
+        {
+            intrin_wmma_f16_16x16x16_f16_w64<MPerWmma, NPerWmma, Opsel>::Run(a, b, reg_c);
+        }
+    }
+};
+
+template <index_t WaveSize> // wmma_type specialization for WmmaInstr::wmma_bf16_16x16x16_bf16
+struct wmma_type<WmmaInstr::wmma_bf16_16x16x16_bf16,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>> // only WaveSize 32 or 64
+{
+    // Fixed properties: literal constants that do not depend on WaveSize
+    static constexpr index_t m_per_wmma               = 16;
+    static constexpr index_t n_per_wmma               = 16;
+    static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t src_a_data_size          = 2;
+    static constexpr index_t src_b_data_size          = 2;
+    static constexpr index_t acc_data_size            = 2; // NOTE(review): WmmaSelector's accumulator static_assert (== m * n * 4) does not hold for 2 - confirm
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+
+    // Wave mode dependent properties (wave_size comes from the WaveSize template argument)
+    static constexpr index_t wave_size                = Number<WaveSize>{};
+    static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
+    static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
+    static constexpr index_t num_acc_vgprs_per_wave =
+        m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4;
+    static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups;
+
+    template <index_t MPerWmma,
+              index_t NPerWmma,
+              index_t Opsel, // NOTE(review): WmmaGemm::Run calls run<MPerWmma, NPerWmma> without Opsel - confirm call site
+              class FloatA,
+              class FloatB,
+              class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    { // forwards (a, b, reg_c) to the intrinsic wrapper that matches wave_size
+        if constexpr(wave_size == 32)
+        {
+            intrin_wmma_bf16_16x16x16_bf16_w32<MPerWmma, NPerWmma, Opsel>::Run(a, b, reg_c);
+        }
+        else if constexpr(wave_size == 64)
+        {
+            intrin_wmma_bf16_16x16x16_bf16_w64<MPerWmma, NPerWmma, Opsel>::Run(a, b, reg_c);
+        }
+    }
+};
+
+#endif
+
+template <index_t WaveSize> // wmma_type specialization for WmmaInstr::wmma_i32_16x16x16_iu8
+struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8,
+                 WaveSize,
+                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>> // only WaveSize 32 or 64
+{
+    // Fixed properties: literal constants that do not depend on WaveSize
+    static constexpr index_t m_per_wmma               = 16;
+    static constexpr index_t n_per_wmma               = 16;
+    static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t src_a_data_size          = 2; // NOTE(review): same value as the 16-bit variants; the iu8 wrappers take int8x16_t - confirm
+    static constexpr index_t src_b_data_size          = 2; // NOTE(review): see src_a_data_size - confirm
+    static constexpr index_t acc_data_size            = 4;
+    static constexpr index_t num_thread_per_subgroups = n_per_wmma;
+
+    // Wave mode dependent properties (wave_size comes from the WaveSize template argument)
+    static constexpr index_t wave_size                = Number<WaveSize>{};
+    static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
+    static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
+    static constexpr index_t num_acc_vgprs_per_wave =
+        m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4;
+    static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups;
+
+    template <index_t MPerWmma,
+              index_t NPerWmma,
+              bool neg_a, // forwarded unchanged to the intrinsic wrapper
+              bool neg_b, // forwarded unchanged to the intrinsic wrapper
+              bool clamp, // forwarded unchanged to the intrinsic wrapper
+              class FloatA,
+              class FloatB,
+              class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    { // forwards (a, b, reg_c) to the intrinsic wrapper that matches wave_size
+        if constexpr(wave_size == 32)
+        {
+            intrin_wmma_i32_16x16x16_iu8_w32<MPerWmma, NPerWmma, neg_a, neg_b, clamp>::Run(
+                a, b, reg_c);
+        }
+        else if constexpr(wave_size == 64)
+        {
+            intrin_wmma_i32_16x16x16_iu8_w64<MPerWmma, NPerWmma, neg_a, neg_b, clamp>::Run(
+                a, b, reg_c);
+        }
+    }
+};
+
 template <typename src_type, typename dst_type, index_t MPerWmma, index_t NPerWmma>
 struct WmmaSelector
 {
@@ -159,20 +326,19 @@ struct WmmaSelector
    }
 #endif
    // get_warp_size do not return the correct wavesize, hardcode to 32 as workaround
-    static constexpr auto selected_wmma = wmma_type<GetWmma<src_type, dst_type, MPerWmma, NPerWmma>(), Number<32>{}>{};
+    static constexpr auto selected_wmma =
+        wmma_type<GetWmma<src_type, dst_type, MPerWmma, NPerWmma>(), Number<32>{}>{};

    __host__ __device__ constexpr WmmaSelector()
    {
-        static_assert(selected_wmma.m_per_wmma == 16,
-                      "WRONG! WMMA_M must equal to 16");
+        static_assert(selected_wmma.m_per_wmma == 16, "WRONG! WMMA_M must equal to 16");

-        static_assert(selected_wmma.m_per_wmma == 16,
-                      "WRONG! WMMA_M must equal to 16");
+        static_assert(selected_wmma.n_per_wmma == 16, "WRONG! WMMA_N must equal to 16");

-        static_assert(selected_wmma.k_per_wmma == 16,
-                      "WRONG! WMMA_M must equal to 16");
+        static_assert(selected_wmma.k_per_wmma == 16, "WRONG! WMMA_K must equal to 16");

-        static_assert(selected_wmma.wave_size * selected_wmma.num_acc_vgprs_per_wave * selected_wmma.acc_data_size==
+        static_assert(selected_wmma.wave_size * selected_wmma.num_acc_vgprs_per_wave *
+                              selected_wmma.acc_data_size ==
                          selected_wmma.m_per_wmma * selected_wmma.n_per_wmma * 4,
                      "WRONG! Invalid Number of Accumulator Register");
    }
@@ -198,7 +364,7 @@ struct WmmaGemm

    __host__ __device__ constexpr WmmaGemm()
    {
-        static_assert(NPerWmma == 16 && MPerWmma == 16 ,
+        static_assert(NPerWmma == 16 && MPerWmma == 16,
                      "Only support GemmNPerWmma == 16 and GemmMPerWmma == 16 for wmma");

        static_assert(KPack == wmma_instr.k_per_wmma, "KPack should be k_per_wmma");
@@ -209,17 +375,23 @@ struct WmmaGemm
    // MPerWMMA_NPerWMMA -> MSubGroup_..._NPerWMMA_MAccVgprPerWave
    template <typename CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA>
    __host__ __device__ static constexpr auto
-    MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs
-    (const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma)
+    MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+        const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA&
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma)
    {
-        const auto MBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0);
-        const auto NBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3);
-        const auto MWave   = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1);
-        const auto NWave   = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4);
+        const auto MBlockxRepeat =
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0);
+        const auto NBlockxRepeat =
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3);
+        const auto MWave =
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1);
+        const auto NWave =
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4);

        return transform_tensor_descriptor(
            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma,
-            make_tuple(make_pass_through_transform(MBlockxRepeat),
+            make_tuple(
+                make_pass_through_transform(MBlockxRepeat),
                make_pass_through_transform(MWave),
                make_unmerge_transform(make_tuple(Number<wmma_instr.num_subgroups>{},
                                                  Number<wmma_instr.num_acc_vgprs_per_wave>{})),
@@ -243,17 +415,23 @@ struct WmmaGemm
    // Per-Pixel write
    template <typename CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA>
    __host__ __device__ static constexpr auto
-    MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup
-    (const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma)
+    MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(
+        const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA&
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma)
    {
-        const auto MBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0);
-        const auto NBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3);
-        const auto MWave   = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1);
-        const auto NWave   = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4);
+        const auto MBlockxRepeat =
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0);
+        const auto NBlockxRepeat =
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3);
+        const auto MWave =
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1);
+        const auto NWave =
+            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4);

        return transform_tensor_descriptor(
            c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma,
-            make_tuple(make_pass_through_transform(MBlockxRepeat),
+            make_tuple(
+                make_pass_through_transform(MBlockxRepeat),
                make_pass_through_transform(MWave),
                make_unmerge_transform(make_tuple(Number<wmma_instr.num_subgroups>{},
                                                  Number<wmma_instr.num_acc_vgprs_per_wave>{})),
@@ -279,15 +457,13 @@ struct WmmaGemm
        return wmma_instr.num_acc_vgprs_per_wave;
    }

-    __device__ static constexpr index_t GetWaveSize() 
-    { 
-        return wmma_instr.wave_size; 
-    }
+    __device__ static constexpr index_t GetWaveSize() { return wmma_instr.wave_size; }

    template <class FloatA, class FloatB, class FloatC>
    __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const
    {
-        static_assert((is_same<src_type, half_t>::value && is_same<dst_type, float>::value) || 
+        static_assert(
+            (is_same<src_type, half_t>::value && is_same<dst_type, float>::value) ||
                (is_same<src_type, bhalf_t>::value && is_same<dst_type, float>::value) ||
                (is_same<src_type, half_t>::value && is_same<dst_type, half_t>::value) ||
                (is_same<src_type, bhalf_t>::value && is_same<dst_type, bhalf_t>::value) ||
@@ -295,23 +471,20 @@ struct WmmaGemm
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
                || (is_same<src_type, int4_t>::value && is_same<dst_type, int32_t>::value)
 #endif
-                      ,"base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), (int8, int32) or (int4, int32)!");
+                ,
+            "base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), "
+            "(int8, int32) or (int4, int32)!");
        if constexpr(!TransposeC)
        {
-            wmma_instr.template run<MPerWmma, NPerWmma>(
-                p_a_wave, p_b_wave, p_c_thread);
+            wmma_instr.template run<MPerWmma, NPerWmma>(p_a_wave, p_b_wave, p_c_thread);
        }
        else
        {
-            wmma_instr.template run<MPerWmma, NPerWmma>(
-                p_b_wave, p_a_wave, p_c_thread);
+            wmma_instr.template run<MPerWmma, NPerWmma>(p_b_wave, p_a_wave, p_c_thread);
        }
    }

-    __device__ static auto GetLaneId() 
-    { 
-        return get_thread_local_1d_id() % wmma_instr.wave_size; 
-    }
+    __device__ static auto GetLaneId() { return get_thread_local_1d_id() % wmma_instr.wave_size; }

    __device__ static auto GetSubGroupId()
    {
@@ -324,7 +497,7 @@ struct WmmaGemm
    }
    __device__ static auto GetSwizzledLaneIdLow()
    {
-        return ((GetLaneIdUnderSubGroup() & 1) << 3 ) | (GetLaneIdUnderSubGroup() >> 1);
+        return ((GetLaneIdUnderSubGroup() & 1) << 3) | (GetLaneIdUnderSubGroup() >> 1);
    }

    __host__ __device__ static auto CalculateAThreadOriginDataIndex()
@@ -348,10 +521,10 @@ struct WmmaGemm
    static constexpr auto wmma       = WmmaSelector<src_type, dst_type, MPerWmma, NPerWmma>{};
    static constexpr auto wmma_instr = wmma.selected_wmma;

-    __host__ __device__ static constexpr auto GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths()
+    __host__ __device__ static constexpr auto
+    GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths()
    {
-        return make_tuple(
-            I1, I1, Number<wmma_instr.num_acc_vgprs_per_wave>{});
+        return make_tuple(I1, I1, Number<wmma_instr.num_acc_vgprs_per_wave>{});
    }
 };


--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -8,6 +8,8 @@
 // TODO: Add arch limitation
 namespace ck {

+/********************************WAVE32 MODE***********************************************/
+
 // src: fp16, dst: fp32
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_wmma_f32_16x16x16_f16_w32;
@@ -23,20 +25,6 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16>
    }
 };

-template <index_t MPerWave, index_t NPerWave>
-struct intrin_wmma_f32_16x16x16_f16_w64;
-
-template <>
-struct intrin_wmma_f32_16x16x16_f16_w64<16, 16>
-{
-    template <class FloatC>
-    __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c)
-    {
-        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(
-            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}]);
-    }
-};
-
 // src: bf16, dst: fp32
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_wmma_f32_16x16x16_bf16_w32;
@@ -111,5 +99,95 @@ struct intrin_wmma_i32_16x16x16_iu8_w32<16, 16, neg_a, neg_b, clamp>
    }
 };

+/********************************WAVE64 MODE***********************************************/
+
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_wmma_f32_16x16x16_f16_w64; // primary template is declared only; <16, 16> is specialized below
+
+template <>
+struct intrin_wmma_f32_16x16x16_f16_w64<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c)
+    { // element 0 of reg_c viewed as float4_t is both the accumulator input and the destination
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(
+            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}]);
+    }
+};
+
+// src: bf16, dst: fp32 (wave64 builtin wrapper)
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_wmma_f32_16x16x16_bf16_w64; // primary template is declared only; <16, 16> is specialized below
+
+template <>
+struct intrin_wmma_f32_16x16x16_bf16_w64<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c)
+    { // element 0 of reg_c viewed as float4_t is both the accumulator input and the destination
+        reg_c.template AsType<float4_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(
+                reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}]);
+    }
+};
+
+// src: fp16, dst: fp16 (wave64 builtin wrapper)
+template <index_t MPerWave, index_t NPerWave, index_t Opsel>
+struct intrin_wmma_f16_16x16x16_f16_w64; // primary template is declared only; <16, 16, Opsel> is specialized below
+
+template <index_t Opsel>
+struct intrin_wmma_f16_16x16x16_f16_w64<16, 16, Opsel>
+{
+    template <class FloatC>
+    __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c)
+    { // element 0 of reg_c viewed as half8_t is both the accumulator input and the destination
+        // Opsel usage (passed as the last builtin argument):
+        // false: D0.[0:15]  = result
+        // true : D0.[16:31] = result
+        reg_c.template AsType<half8_t>()(Number<0>{}) = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(
+            reg_a, reg_b, reg_c.template AsType<half8_t>()[Number<0>{}], Opsel);
+    }
+};
+
+// src: bf16, dst: bf16 (wave64 builtin wrapper)
+template <index_t MPerWave, index_t NPerWave, index_t Opsel>
+struct intrin_wmma_bf16_16x16x16_bf16_w64; // primary template is declared only; <16, 16, Opsel> is specialized below
+
+template <index_t Opsel>
+struct intrin_wmma_bf16_16x16x16_bf16_w64<16, 16, Opsel>
+{
+    template <class FloatC>
+    __device__ static void Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c)
+    { // element 0 of reg_c viewed as bhalf8_t is both the accumulator input and the destination
+        // Opsel usage (passed as the last builtin argument):
+        // false: D0.[0:15]  = result
+        // true : D0.[16:31] = result
+        reg_c.template AsType<bhalf8_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(
+                reg_a, reg_b, reg_c.template AsType<bhalf8_t>()[Number<0>{}], Opsel);
+    }
+};
+
+// src: iu8, dst: i32 (wave64 builtin wrapper)
+template <index_t MPerWave, index_t NPerWave, bool neg_a, bool neg_b, bool clamp>
+struct intrin_wmma_i32_16x16x16_iu8_w64; // primary template is declared only; <16, 16, ...> is specialized below
+
+template <bool neg_a, bool neg_b, bool clamp>
+struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp>
+{
+    template <class FloatC>
+    __device__ static void Run(const int8x16_t& reg_a, const int8x16_t& reg_b, FloatC& reg_c)
+    { // element 0 of reg_c viewed as int32x4_t is both the accumulator input and the destination
+        reg_c.template AsType<int32x4_t>()(Number<0>{}) =
+            __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(
+                neg_a,
+                bit_cast<int32x4_t>(reg_a), // int8x16_t operand reinterpreted as int32x4_t for the builtin
+                neg_b,
+                bit_cast<int32x4_t>(reg_b), // int8x16_t operand reinterpreted as int32x4_t for the builtin
+                reg_c.template AsType<int32x4_t>()[Number<0>{}],
+                clamp);
+    }
+};
+
 } // namespace ck
 #endif