Merge remote-tracking branch 'origin/develop' into aosewski/gemm_tile_loop

271269a5 · Adam Osewski · 648f1f13 · 04f93aad · 271269a5 · 271269a5
Commit 271269a5 authored Oct 05, 2023 by Adam Osewski
20 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -139,7 +139,8 @@ __host__ __device__ constexpr auto make_merge_transform_v4_no_carry(const LowLen
 }
 template <typename GridwiseGemm,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatC,
          typename AGridDesc_B_K0_M_K1,
          typename BGridDesc_B_K0_N_K1,
@@ -153,8 +154,8 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_bwd_weight(const FloatAB* __restrict__ p_a_grid,
+        kernel_gemm_xdlops_bwd_weight(const FloatA* __restrict__ p_a_grid,
-                                      const FloatAB* __restrict__ p_b_grid,
+                                      const FloatB* __restrict__ p_b_grid,
                                      FloatC* __restrict__ p_c_grid,
                                      const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc,
                                      const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc,
@@ -181,21 +182,22 @@ __global__ void
                                                  c_element_op,
                                                  c_block_cluster_adaptor);
 #else
-    ignore                = p_a_grid;
+    ignore               = p_a_grid;
-    ignore                = p_b_grid;
+    ignore               = p_b_grid;
-    ignore                = p_c_grid;
+    ignore               = p_c_grid;
-    ignore                = a_b_k0_m_k1_grid_desc;
+    ignore               = a_b_k0_m_k1_grid_desc;
-    ignore                = b_b_k0_n_k1_grid_desc;
+    ignore               = b_b_k0_n_k1_grid_desc;
-    ignore                = c_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore               = c_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore                = a_element_op;
+    ignore               = a_element_op;
-    ignore                = b_element_op;
+    ignore               = b_element_op;
-    ignore                = c_element_op;
+    ignore               = c_element_op;
-    ignore                = c_block_cluster_adaptor;
+    ignore               = c_block_cluster_adaptor;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename FloatC,
          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
@@ -242,7 +244,9 @@ template <index_t BlockSize,
          bool ABlockLdsExtraM1Wrw      = false,
          bool BBlockLdsExtraN1Wrw      = false,
          index_t NumGemmKPrefetchStage = 1,
-          PipelineVersion PipelineVer   = PipelineVersion::v1>
+          PipelineVersion PipelineVer   = PipelineVersion::v1,
+          typename ComputeTypeA         = FloatA,
+          typename ComputeTypeB         = ComputeTypeA>
 struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
 {
    static constexpr auto I0 = Number<0>{};
@@ -265,11 +269,16 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
    // denorm test fix, required to work around fp16 mfma issue
    // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
    // when mfma is fixed, remove this section and update
-    // FloatABAdjusted -> FloatAB throughout this file
+    // FloatAAdjusted -> ComputeTypeA, FloatBAdjusted -> ComputeTypeB,
+    // throughout this file
 #if CK_WORKAROUND_DENORM_FIX
-    using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
+    using FloatAAdjusted =
+        conditional_t<is_same_v<ComputeTypeA, ck::half_t>, ck::bhalf_t, ComputeTypeA>;
+    using FloatBAdjusted =
+        conditional_t<is_same_v<ComputeTypeB, ck::half_t>, ck::bhalf_t, ComputeTypeB>;
 #else
-    using FloatABAdjusted = FloatAB;
+    using FloatAAdjusted = ComputeTypeA;
+    using FloatBAdjusted = ComputeTypeB;
 #endif
    // M0/M1/M1Padding
@@ -506,7 +515,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        constexpr auto c_block_size =
            GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock().GetElementSpaceSize();
-        return math::max((a_block_space_size + b_block_space_size) * sizeof(FloatAB),
+        return math::max((a_block_space_size * sizeof(FloatAAdjusted) +
+                          b_block_space_size * sizeof(FloatBAdjusted)),
                         c_block_size * sizeof(FloatC));
    }
@@ -610,8 +620,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
    using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1));
    template <bool HasMainKBlockLoop>
-    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
+    __device__ static void Run(const FloatA* __restrict__ p_a_grid,
-                               const FloatAB* __restrict__ p_b_grid,
+                               const FloatB* __restrict__ p_b_grid,
                               FloatC* __restrict__ p_c_grid,
                               void* __restrict__ p_shared,
                               const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
@@ -673,8 +683,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                Sequence<1, K0PerBlock, MPerBlock, K1>,
                                                ABlockTransferThreadClusterLengths_K0_M_K1,
                                                ABlockTransferThreadClusterArrangeOrder,
-                                                FloatAB,
+                                                FloatA,
-                                                FloatABAdjusted,
+                                                FloatAAdjusted,
                                                decltype(a_b_k0_m_k1_grid_desc),
                                                decltype(a_b_k0_m_k1_block_desc),
                                                ABlockTransferSrcAccessOrder,
@@ -703,8 +713,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                Sequence<1, K0PerBlock, NPerBlock, K1>,
                                                BBlockTransferThreadClusterLengths_K0_N_K1,
                                                BBlockTransferThreadClusterArrangeOrder,
-                                                FloatAB,
+                                                FloatB,
-                                                FloatABAdjusted,
+                                                FloatBAdjusted,
                                                decltype(b_b_k0_n_k1_grid_desc),
                                                decltype(b_b_k0_n_k1_block_desc),
                                                BBlockTransferSrcAccessOrder,
@@ -733,11 +743,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        // sanity check
        constexpr index_t KPack =
-            math::max(K1, MfmaSelector<FloatABAdjusted, MPerXDL, NPerXDL>::selected_mfma.k_per_blk);
+            math::max(K1,
+                      MfmaSelector<FloatAAdjusted, MPerXDL, NPerXDL, FloatBAdjusted>::selected_mfma
+                          .k_per_blk);
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatABAdjusted,
+                                                                FloatAAdjusted,
+                                                                FloatBAdjusted,
                                                                FloatAcc,
                                                                decltype(a_k0_m_k1_block_desc),
                                                                decltype(b_k0_n_k1_block_desc),
@@ -757,10 +770,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);
        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatABAdjusted*>(p_shared), a_k0_m_k1_block_desc.GetElementSpaceSize());
+            static_cast<FloatAAdjusted*>(p_shared), a_k0_m_k1_block_desc.GetElementSpaceSize());
        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatABAdjusted*>(p_shared) + a_block_space_size,
+            static_cast<FloatBAdjusted*>(p_shared) + a_block_space_size,
            b_k0_n_k1_block_desc.GetElementSpaceSize());
        // gridwise GEMM pipeline

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
@@ -490,6 +490,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatAB,
                                                                FloatAB,
                                                                FloatAcc,
                                                                decltype(a_block_desc_k0_m_k1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -424,6 +424,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        using BlockwiseGemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatABAdjusted,
                                                                FloatABAdjusted,
                                                                FloatAcc,
                                                                decltype(a_block_desc_k0_m_k1),
@@ -569,6 +570,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
            FloatABAdjusted,
+            FloatABAdjusted,
            FloatAcc,
            decltype(a_block_desc_k0_m_k1),
            decltype(b_block_desc_k0_n_k1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -762,6 +762,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
            ComputeType,
+            ComputeType,
            FloatAcc,
            decltype(a_k0_m_k1_block_desc),
            decltype(b_k0_n_k1_block_desc),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -451,6 +451,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatAB,
                                                                FloatAB,
                                                                FloatAcc,
                                                                decltype(a_block_desc_ak0_m_ak1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -471,6 +471,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatAB,
                                                                FloatAB,
                                                                FloatAcc,
                                                                decltype(a_block_desc_k0_m_k1),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -489,6 +489,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                FloatAB,
                                                                FloatAB,
                                                                FloatAcc,
                                                                decltype(a_block_desc_k0_m_k1),

--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -31,7 +31,13 @@ enum struct MfmaInstr
    mfma_i32_16x16x32i8,
    mfma_f64_16x16x4f64,
    mfma_f32_32x32x16f8f8,
-    mfma_f32_16x16x32f8f8
+    mfma_f32_16x16x32f8f8,
+    mfma_f32_32x32x16bf8bf8,
+    mfma_f32_16x16x32bf8bf8,
+    mfma_f32_32x32x16f8bf8,
+    mfma_f32_16x16x32f8bf8,
+    mfma_f32_32x32x16bf8f8,
+    mfma_f32_16x16x32bf8f8
 };
 template <MfmaInstr instr>
@@ -502,10 +508,154 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8f8>
 };
 #endif
-template <typename base_type, index_t MPerXdlops, index_t NPerXdlops>
+#if defined CK_ENABLE_BF8
+template <> // Trait table for MfmaInstr::mfma_f32_32x32x16bf8bf8.
+struct mfma_type<MfmaInstr::mfma_f32_32x32x16bf8bf8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16; // == group_size * num_groups_per_blk
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64; // == num_threads_per_blk * num_input_blks
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8; // XdlopsGemm::Run loops KPack / k_per_blk times
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // forwards a, b, reg_c unchanged
+    {
+        intrin_mfma_f32_32x32x16bf8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+template <> // Trait table for MfmaInstr::mfma_f32_16x16x32bf8bf8.
+struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8bf8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4; // == group_size * num_groups_per_blk
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64; // == num_threads_per_blk * num_input_blks
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8; // XdlopsGemm::Run loops KPack / k_per_blk times
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // forwards a, b, reg_c unchanged
+    {
+        intrin_mfma_f32_16x16x32bf8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+#endif
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+template <> // Trait table for MfmaInstr::mfma_f32_32x32x16f8bf8 (selected for base f8_t, additional bf8_t).
+struct mfma_type<MfmaInstr::mfma_f32_32x32x16f8bf8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16; // == group_size * num_groups_per_blk
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64; // == num_threads_per_blk * num_input_blks
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8; // XdlopsGemm::Run loops KPack / k_per_blk times
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // forwards a, b, reg_c unchanged
+    {
+        intrin_mfma_f32_32x32x16f8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+template <> // Trait table for MfmaInstr::mfma_f32_16x16x32f8bf8 (selected for base f8_t, additional bf8_t).
+struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8bf8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4; // == group_size * num_groups_per_blk
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64; // == num_threads_per_blk * num_input_blks
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8; // XdlopsGemm::Run loops KPack / k_per_blk times
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // forwards a, b, reg_c unchanged
+    {
+        intrin_mfma_f32_16x16x32f8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+#endif
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+template <> // Trait table for MfmaInstr::mfma_f32_32x32x16bf8f8 (selected for base bf8_t, additional f8_t).
+struct mfma_type<MfmaInstr::mfma_f32_32x32x16bf8f8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16; // == group_size * num_groups_per_blk
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64; // == num_threads_per_blk * num_input_blks
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8; // XdlopsGemm::Run loops KPack / k_per_blk times
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // forwards a, b, reg_c unchanged
+    {
+        intrin_mfma_f32_32x32x16bf8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+template <> // Trait table for MfmaInstr::mfma_f32_16x16x32bf8f8 (selected for base bf8_t, additional f8_t).
+struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4; // == group_size * num_groups_per_blk
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64; // == num_threads_per_blk * num_input_blks
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8; // XdlopsGemm::Run loops KPack / k_per_blk times
+    static constexpr bool is_k_reduction         = true;
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // forwards a, b, reg_c unchanged
+    {
+        intrin_mfma_f32_16x16x32bf8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+#endif
+template <typename base_type,
+          index_t MPerXdlops,
+          index_t NPerXdlops,
+          typename additional_type = base_type>
 struct MfmaSelector
 {
-    template <typename base_type_, index_t MPerXdlops_, index_t NPerXdlops_>
+    template <typename base_type_,
+              index_t MPerXdlops_,
+              index_t NPerXdlops_,
+              typename additional_type_ = base_type_>
    static constexpr auto GetMfma();
    template <>
@@ -656,7 +806,50 @@ struct MfmaSelector
    }
 #endif
-    static constexpr auto selected_mfma = mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops>()>{};
+#if defined CK_ENABLE_BF8
+    template <> // bf8_t with MPerXdlops_ = NPerXdlops_ = 32; additional_type_ takes its default (base_type_, i.e. bf8_t)
+    static constexpr auto GetMfma<bf8_t, 32, 32>()
+    {
+        return MfmaInstr::mfma_f32_32x32x16bf8bf8; // instruction id consumed by mfma_type<> via selected_mfma
+    }
+    template <> // bf8_t with MPerXdlops_ = NPerXdlops_ = 16; additional_type_ takes its default (base_type_, i.e. bf8_t)
+    static constexpr auto GetMfma<bf8_t, 16, 16>()
+    {
+        return MfmaInstr::mfma_f32_16x16x32bf8bf8; // instruction id consumed by mfma_type<> via selected_mfma
+    }
+#endif
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+    template <> // mixed operands: base_type_ = f8_t, additional_type_ = bf8_t, MPerXdlops_ = NPerXdlops_ = 32
+    static constexpr auto GetMfma<f8_t, 32, 32, bf8_t>()
+    {
+        return MfmaInstr::mfma_f32_32x32x16f8bf8; // instruction id consumed by mfma_type<> via selected_mfma
+    }
+    template <> // mixed operands: base_type_ = f8_t, additional_type_ = bf8_t, MPerXdlops_ = NPerXdlops_ = 16
+    static constexpr auto GetMfma<f8_t, 16, 16, bf8_t>()
+    {
+        return MfmaInstr::mfma_f32_16x16x32f8bf8; // instruction id consumed by mfma_type<> via selected_mfma
+    }
+#endif
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+    template <> // mixed operands: base_type_ = bf8_t, additional_type_ = f8_t, MPerXdlops_ = NPerXdlops_ = 32
+    static constexpr auto GetMfma<bf8_t, 32, 32, f8_t>()
+    {
+        return MfmaInstr::mfma_f32_32x32x16bf8f8; // instruction id consumed by mfma_type<> via selected_mfma
+    }
+    template <> // mixed operands: base_type_ = bf8_t, additional_type_ = f8_t, MPerXdlops_ = NPerXdlops_ = 16
+    static constexpr auto GetMfma<bf8_t, 16, 16, f8_t>()
+    {
+        return MfmaInstr::mfma_f32_16x16x32bf8f8; // instruction id consumed by mfma_type<> via selected_mfma
+    }
+#endif
+    static constexpr auto selected_mfma =
+        mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops, additional_type>()>{};
    __host__ __device__ constexpr MfmaSelector()
    {
@@ -703,7 +896,8 @@ template <typename base_type,
          index_t MPerXdlops,
          index_t NPerXdlops,
          index_t KPack,
-          bool TransposeC = false>
+          typename additional_type = base_type,
+          bool TransposeC          = false>
 struct XdlopsGemm
 {
    static constexpr auto I0 = Number<0>{};
@@ -854,14 +1048,22 @@ struct XdlopsGemm
    template <class FloatA, class FloatB, class FloatC>
    __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const
    {
-        static_assert(is_same<base_type, double>::value || is_same<base_type, float>::value ||
+        static_assert(
-                          is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
+            is_same<base_type, double>::value || is_same<base_type, float>::value ||
-                          is_same<base_type, int8_t>::value
+                is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
+                is_same<base_type, int8_t>::value
 #if defined CK_ENABLE_FP8
-                          || is_same<base_type, f8_t>::value
+                || is_same<base_type, f8_t>::value
+#endif
+#if defined CK_ENABLE_BF8
+                || is_same<base_type, bf8_t>::value
+#endif
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+                || (is_same<base_type, f8_t>::value && is_same<additional_type, bf8_t>::value) ||
+                (is_same<base_type, bf8_t>::value && is_same<additional_type, f8_t>::value)
 #endif
-                      ,
+                ,
-                      "base base_type must be double, float, half, bfloat16, and int8_t!");
+            "base base_type must be double, float, half, bfloat16, int8_t, f8_t or bf8_t!");
        static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) {
            if constexpr(!TransposeC)
@@ -957,7 +1159,7 @@ struct XdlopsGemm
        return TransposeC ? CIndex4D{blk_td, I0, blk_id, I0} : CIndex4D{I0, blk_id, I0, blk_td};
    }
-    static constexpr auto mfma = MfmaSelector<base_type, MPerXdlops, NPerXdlops>{};
+    static constexpr auto mfma = MfmaSelector<base_type, MPerXdlops, NPerXdlops, additional_type>{};
    static constexpr auto mfma_instr = mfma.selected_mfma;

--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -1127,37 +1127,53 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
 #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
    uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
-#if defined CK_ENABLE_FP8
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-    if constexpr(is_same<scalar_t, f8_t>::value)
+    if constexpr(is_same<scalar_t, f8_t>::value || is_same<scalar_t, bf8_t>::value)
-    {
-        auto tmp = amd_buffer_load_impl<int8_t, vector_size, coherence>(
-            src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
-        return bit_cast<vector_t>(tmp);
-    }
-    else
-    {
 #endif
-        return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+#if defined CK_ENABLE_FP8 && !defined CK_ENABLE_BF8
-            src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
+        if constexpr(is_same<scalar_t, f8_t>::value)
-#if defined CK_ENABLE_FP8
+#endif
-    }
+#if !defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+            if constexpr(is_same<scalar_t, bf8_t>::value)
+#endif
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+            {
+                auto tmp = amd_buffer_load_impl<int8_t, vector_size, coherence>(
+                    src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
+                return bit_cast<vector_t>(tmp);
+            }
+            else
+            {
+#endif
+                return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+                    src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+            }
 #endif
 #else
-#if defined CK_ENABLE_FP8
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-    if constexpr(is_same<scalar_t, f8_t>::value)
+    if constexpr(is_same<scalar_t, f8_t>::value || is_same<scalar_t, bf8_t>::value)
-    {
-        auto tmp = amd_buffer_load_impl<int8_t, vector_size, coherence>(
-            src_wave_buffer_resource, src_thread_addr_offset, 0);
-        return src_thread_element_valid ? bit_cast<vector_t>(tmp) : vector_t(0);
-    }
-    else
-    {
 #endif
-        vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+#if defined CK_ENABLE_FP8 && !defined CK_ENABLE_BF8
-            src_wave_buffer_resource, src_thread_addr_offset, 0);
+        if constexpr(is_same<scalar_t, f8_t>::value)
-        return src_thread_element_valid ? tmp : vector_t(0);
+#endif
-#if defined CK_ENABLE_FP8
+#if !defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-    }
+            if constexpr(is_same<scalar_t, bf8_t>::value)
+#endif
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+            {
+                auto tmp = amd_buffer_load_impl<int8_t, vector_size, coherence>(
+                    src_wave_buffer_resource, src_thread_addr_offset, 0);
+                return src_thread_element_valid ? bit_cast<vector_t>(tmp) : vector_t(0);
+            }
+            else
+            {
+#endif
+                vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+                    src_wave_buffer_resource, src_thread_addr_offset, 0);
+                return src_thread_element_valid ? tmp : vector_t(0);
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+            }
 #endif
 #endif
 }
@@ -1216,40 +1232,61 @@ __device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::t
 #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
-#if defined CK_ENABLE_FP8
-    if constexpr(is_same<scalar_t, f8_t>::value)
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-    {
+    if constexpr(is_same<scalar_t, f8_t>::value || is_same<scalar_t, bf8_t>::value)
-        auto tmp =
-            bit_cast<typename vector_type_maker<int8_t, vector_size>::type::type>(src_thread_data);
-        amd_buffer_store_impl<int8_t, vector_size, coherence>(
-            tmp, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
-    }
-    else
-    {
 #endif
-        amd_buffer_store_impl<scalar_t, vector_size, coherence>(
+#if defined CK_ENABLE_FP8 && !defined CK_ENABLE_BF8
-            src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
+        if constexpr(is_same<scalar_t, f8_t>::value)
-#if defined CK_ENABLE_FP8
+#endif
-    }
+#if !defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+            if constexpr(is_same<scalar_t, bf8_t>::value)
+#endif
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+            {
+                auto tmp = bit_cast<typename vector_type_maker<int8_t, vector_size>::type::type>(
+                    src_thread_data);
+                amd_buffer_store_impl<int8_t, vector_size, coherence>(
+                    tmp, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
+            }
+            else
+            {
+#endif
+                amd_buffer_store_impl<scalar_t, vector_size, coherence>(src_thread_data,
+                                                                        dst_wave_buffer_resource,
+                                                                        dst_addr_shift +
+                                                                            dst_thread_addr_offset,
+                                                                        0);
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+            }
 #endif
 #else
    if(dst_thread_element_valid)
    {
-#if defined CK_ENABLE_FP8
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-        if constexpr(is_same<scalar_t, f8_t>::value)
+        if constexpr(is_same<scalar_t, f8_t>::value || is_same<scalar_t, bf8_t>::value)
-        {
-            auto tmp = bit_cast<typename vector_type_maker<int8_t, vector_size>::type::type>(
-                src_thread_data);
-            amd_buffer_store_impl<int8_t, vector_size, coherence>(
-                tmp, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
-        }
-        else
-        {
 #endif
-            amd_buffer_store_impl<scalar_t, vector_size, coherence>(
+#if defined CK_ENABLE_FP8 && !defined CK_ENABLE_BF8
-                src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+            if constexpr(is_same<scalar_t, f8_t>::value)
-#if defined CK_ENABLE_FP8
+#endif
-        }
+#if !defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+                if constexpr(is_same<scalar_t, bf8_t>::value)
+#endif
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+                {
+                    auto tmp =
+                        bit_cast<typename vector_type_maker<int8_t, vector_size>::type::type>(
+                            src_thread_data);
+                    amd_buffer_store_impl<int8_t, vector_size, coherence>(
+                        tmp, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+                }
+                else
+                {
+#endif
+                    amd_buffer_store_impl<scalar_t, vector_size, coherence>(
+                        src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+#if defined CK_ENABLE_FP8 || defined CK_ENABLE_BF8
+                }
 #endif
    }
 #endif

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -419,5 +419,200 @@ struct intrin_mfma_f32_16x16x32f8f8<16, 16>
    }
 };
 #endif
+#if defined CK_ENABLE_BF8
+template <index_t MPerWave, index_t NPerWave> // primary template is declared only
+struct intrin_mfma_f32_32x32x16bf8bf8;
+template <> // only the <32, 32> shape is specialized here
+struct intrin_mfma_f32_32x32x16bf8bf8<32, 32>
+{
+    template <class FloatC> // FloatC: type of reg_c; the native path needs AsType<float16_t>()
+    __device__ static void Run(const bf8x8_t& reg_a, const bf8x8_t& reg_b, FloatC& reg_c)
+    { // feeds bf8 reg_a / bf8 reg_b to an MFMA and writes the result into reg_c in place
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // native builtin path
+        reg_c.template AsType<float16_t>()(Number<0>{}) = // result overwrites slot 0 of reg_c
+            __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
+                bit_cast<long>(reg_a), // the 8-element bf8 vector reinterpreted as one long
+                bit_cast<long>(reg_b),
+                reg_c.template AsType<float16_t>()[Number<0>{}], // current slot 0 as 3rd operand
+                0, // NOTE(review): the three zeros are presumably cbsz/abid/blgp - confirm
+                0,
+                0);
+#else // other targets: emulate through the f32 MFMA wrapper
+        vector_type<bf8_t, 8> reg_a_v(reg_a); // element-addressable copies of the operands
+        vector_type<bf8_t, 8> reg_b_v(reg_b);
+        static_for<0, 8, 1>{}([&](auto k) { // one wrapper call per element index 0..7
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<bf8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<bf8_t>()[Number<k>{}]);
+            intrin_mfma_f32_32x32x2f32<32, 32>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+template <index_t MPerWave, index_t NPerWave> // primary template is declared only
+struct intrin_mfma_f32_16x16x32bf8bf8;
+template <> // only the <16, 16> shape is specialized here
+struct intrin_mfma_f32_16x16x32bf8bf8<16, 16>
+{
+    template <class FloatC> // FloatC: type of reg_c; the native path needs AsType<float4_t>()
+    __device__ static void Run(const bf8x8_t& reg_a, const bf8x8_t& reg_b, FloatC& reg_c)
+    { // feeds bf8 reg_a / bf8 reg_b to an MFMA and writes the result into reg_c in place
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // native builtin path
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(
+            bit_cast<long>(reg_a), // the 8-element bf8 vector reinterpreted as one long
+            bit_cast<long>(reg_b),
+            reg_c.template AsType<float4_t>()[Number<0>{}], // current slot 0 as 3rd operand
+            0, // NOTE(review): the three zeros are presumably cbsz/abid/blgp - confirm
+            0,
+            0);
+#else // other targets: emulate through the f32 MFMA wrapper
+        vector_type<bf8_t, 8> reg_a_v(reg_a); // element-addressable copies of the operands
+        vector_type<bf8_t, 8> reg_b_v(reg_b);
+        static_for<0, 8, 1>{}([&](auto k) { // one wrapper call per element index 0..7
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<bf8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<bf8_t>()[Number<k>{}]);
+            intrin_mfma_f32_16x16x4f32<16, 16>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+#endif
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+template <index_t MPerWave, index_t NPerWave> // primary template is declared only
+struct intrin_mfma_f32_32x32x16f8bf8;
+template <> // only the <32, 32> shape is specialized here
+struct intrin_mfma_f32_32x32x16f8bf8<32, 32>
+{
+    template <class FloatC> // FloatC: type of reg_c; the native path needs AsType<float16_t>()
+    __device__ static void Run(const f8x8_t& reg_a, const bf8x8_t& reg_b, FloatC& reg_c)
+    { // mixed operands: reg_a holds f8, reg_b holds bf8; reg_c is updated in place
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // native builtin path
+        reg_c.template AsType<float16_t>()(Number<0>{}) = // result overwrites slot 0 of reg_c
+            __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
+                bit_cast<long>(reg_a), // the 8-element f8 vector reinterpreted as one long
+                bit_cast<long>(reg_b), // the 8-element bf8 vector reinterpreted as one long
+                reg_c.template AsType<float16_t>()[Number<0>{}], // current slot 0 as 3rd operand
+                0, // NOTE(review): the three zeros are presumably cbsz/abid/blgp - confirm
+                0,
+                0);
+#else // other targets: emulate through the f32 MFMA wrapper
+        vector_type<f8_t, 8> reg_a_v(reg_a); // element-addressable copies of the operands
+        vector_type<bf8_t, 8> reg_b_v(reg_b);
+        static_for<0, 8, 1>{}([&](auto k) { // one wrapper call per element index 0..7
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<f8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<bf8_t>()[Number<k>{}]);
+            intrin_mfma_f32_32x32x2f32<32, 32>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+template <index_t MPerWave, index_t NPerWave> // primary template is declared only
+struct intrin_mfma_f32_16x16x32f8bf8;
+template <> // only the <16, 16> shape is specialized here
+struct intrin_mfma_f32_16x16x32f8bf8<16, 16>
+{
+    template <class FloatC> // FloatC: type of reg_c; the native path needs AsType<float4_t>()
+    __device__ static void Run(const f8x8_t& reg_a, const bf8x8_t& reg_b, FloatC& reg_c)
+    { // mixed operands: reg_a holds f8, reg_b holds bf8; reg_c is updated in place
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // native builtin path
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(
+            bit_cast<long>(reg_a), // the 8-element f8 vector reinterpreted as one long
+            bit_cast<long>(reg_b), // the 8-element bf8 vector reinterpreted as one long
+            reg_c.template AsType<float4_t>()[Number<0>{}], // current slot 0 as 3rd operand
+            0, // NOTE(review): the three zeros are presumably cbsz/abid/blgp - confirm
+            0,
+            0);
+#else // other targets: emulate through the f32 MFMA wrapper
+        vector_type<f8_t, 8> reg_a_v(reg_a); // element-addressable copies of the operands
+        vector_type<bf8_t, 8> reg_b_v(reg_b);
+        static_for<0, 8, 1>{}([&](auto k) { // one wrapper call per element index 0..7
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<f8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<bf8_t>()[Number<k>{}]);
+            intrin_mfma_f32_16x16x4f32<16, 16>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+#endif
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+template <index_t MPerWave, index_t NPerWave> // primary template is declared only
+struct intrin_mfma_f32_32x32x16bf8f8;
+template <> // only the <32, 32> shape is specialized here
+struct intrin_mfma_f32_32x32x16bf8f8<32, 32>
+{
+    template <class FloatC> // FloatC: type of reg_c; the native path needs AsType<float16_t>()
+    __device__ static void Run(const bf8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    { // mixed operands: reg_a holds bf8, reg_b holds f8; reg_c is updated in place
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // native builtin path
+        reg_c.template AsType<float16_t>()(Number<0>{}) = // result overwrites slot 0 of reg_c
+            __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
+                bit_cast<long>(reg_a), // the 8-element bf8 vector reinterpreted as one long
+                bit_cast<long>(reg_b), // the 8-element f8 vector reinterpreted as one long
+                reg_c.template AsType<float16_t>()[Number<0>{}], // current slot 0 as 3rd operand
+                0, // NOTE(review): the three zeros are presumably cbsz/abid/blgp - confirm
+                0,
+                0);
+#else // other targets: emulate through the f32 MFMA wrapper
+        vector_type<bf8_t, 8> reg_a_v(reg_a); // element-addressable copies of the operands
+        vector_type<f8_t, 8> reg_b_v(reg_b);
+        static_for<0, 8, 1>{}([&](auto k) { // one wrapper call per element index 0..7
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<bf8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<f8_t>()[Number<k>{}]);
+            intrin_mfma_f32_32x32x2f32<32, 32>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+template <index_t MPerWave, index_t NPerWave> // primary template is declared only
+struct intrin_mfma_f32_16x16x32bf8f8;
+template <> // only the <16, 16> shape is specialized here
+struct intrin_mfma_f32_16x16x32bf8f8<16, 16>
+{
+    template <class FloatC> // FloatC: type of reg_c; the native path needs AsType<float4_t>()
+    __device__ static void Run(const bf8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    { // mixed operands: reg_a holds bf8, reg_b holds f8; reg_c is updated in place
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // native builtin path
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(
+            bit_cast<long>(reg_a), // the 8-element bf8 vector reinterpreted as one long
+            bit_cast<long>(reg_b), // the 8-element f8 vector reinterpreted as one long
+            reg_c.template AsType<float4_t>()[Number<0>{}], // current slot 0 as 3rd operand
+            0, // NOTE(review): the three zeros are presumably cbsz/abid/blgp - confirm
+            0,
+            0);
+#else // other targets: emulate through the f32 MFMA wrapper
+        vector_type<bf8_t, 8> reg_a_v(reg_a); // element-addressable copies of the operands
+        vector_type<f8_t, 8> reg_b_v(reg_b);
+        static_for<0, 8, 1>{}([&](auto k) { // one wrapper call per element index 0..7
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<bf8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<f8_t>()[Number<k>{}]);
+            intrin_mfma_f32_16x16x4f32<16, 16>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+#endif
 } // namespace ck
 #endif
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -221,7 +221,7 @@ inline __host__ __device__ bf8_t type_convert<bf8_t, half_t>(half_t x)
 {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
    // convert to float and use native conversion
-    return type_convert<f8_t>(type_convert<float>(x));
+    return type_convert<bf8_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
@@ -23,6 +23,7 @@ template <ck::index_t NumDimM,
          typename BDataType,
          typename CDataType,
          typename AccDataType,
+          typename ComputeDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
@@ -69,19 +70,24 @@ struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::Base
                {
                    for(ck::index_t k1 = 0; k1 < K1; ++k1)
                    {
+                        // Simulate the possible casting when ComputeDataType is different than the
+                        // A/B data types
+                        ComputeDataType v_a_compute_input =
+                            ck::type_convert<ComputeDataType>(arg.a_ms_ks_(m0, m1, k0, k1));
+                        ComputeDataType v_b_compute_input =
+                            ck::type_convert<ComputeDataType>(arg.b_ns_ks_(n0, n1, k0, k1));
                        AccDataType v_a;
                        AccDataType v_b;
-                        arg.a_element_op_(
+                        arg.a_element_op_(v_a, ck::type_convert<AccDataType>(v_a_compute_input));
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
+                        arg.b_element_op_(v_b, ck::type_convert<AccDataType>(v_b_compute_input));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
                        v_acc += v_a * v_b;
                    }
                }
-                arg.c_ms_ns_(m0, m1, n0, n1) = v_acc;
+                arg.c_ms_ns_(m0, m1, n0, n1) = ck::type_convert<CDataType>(v_acc);
            };
            make_ParallelTensorFunctor(f_ms_ns,

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
@@ -25,6 +25,8 @@ template <ck::index_t NDimSpatial,
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation,
+          typename ComputeTypeA                                                     = OutDataType,
+          typename ComputeTypeB                                                     = InDataType,
          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
 struct ReferenceConvBwdWeight : public device::BaseOperator
 {
@@ -98,8 +100,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                            if(wi >= 0 &&
                               ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
                            {
-                                float v_out;
+                                ComputeTypeA v_out;
-                                float v_in;
+                                ComputeTypeB v_in;
                                arg.out_element_op_(
                                    v_out, ck::type_convert<float>(arg.output_(g, n, k, wo)));
@@ -107,7 +109,7 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                arg.in_element_op_(
                                    v_in, ck::type_convert<float>(arg.input_(g, n, c, wi)));
-                                v_acc += v_out * v_in;
+                                v_acc += type_convert<float>(v_out) * type_convert<float>(v_in);
                            }
                        }
                    }
@@ -158,8 +160,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                   wi >= 0 &&
                                   ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
                                {
-                                    float v_out;
+                                    ComputeTypeA v_out;
-                                    float v_in;
+                                    ComputeTypeB v_in;
                                    arg.out_element_op_(
                                        v_out,
@@ -168,7 +170,7 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                    arg.in_element_op_(
                                        v_in, ck::type_convert<float>(arg.input_(g, n, c, hi, wi)));
-                                    v_acc += v_out * v_in;
+                                    v_acc += type_convert<float>(v_out) * type_convert<float>(v_in);
                                }
                            }
                        }
@@ -226,8 +228,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                       ck::type_convert<std::size_t>(wi) <
                                           arg.input_.GetLengths()[5])
                                    {
-                                        float v_out;
+                                        ComputeTypeA v_out;
-                                        float v_in;
+                                        ComputeTypeB v_in;
                                        arg.out_element_op_(v_out,
                                                            ck::type_convert<float>(
@@ -237,7 +239,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                                           ck::type_convert<float>(
                                                               arg.input_(g, n, c, di, hi, wi)));
-                                        v_acc += v_out * v_in;
+                                        v_acc +=
+                                            type_convert<float>(v_out) * type_convert<float>(v_in);
                                    }
                                }
                            }

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -21,7 +21,8 @@ template <typename ADataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
-          typename ComputType = ADataType>
+          typename ComputeTypeA = ADataType,
+          typename ComputeTypeB = ComputeTypeA>
 struct ReferenceGemm : public device::BaseOperator
 {
    // Argument
@@ -65,8 +66,8 @@ struct ReferenceGemm : public device::BaseOperator
                for(int k = 0; k < K; ++k)
                {
-                    ComputType v_a;
+                    ComputeTypeA v_a;
-                    ComputType v_b;
+                    ComputeTypeB v_b;
                    // use PassThrough instead of ConvertBF16RTN for reference calculation
                    if constexpr(is_same_v<AElementwiseOperation,

--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -29,6 +29,8 @@ using BF8 = ck::bf8_t;
 using Empty_Tuple = ck::Tuple<>;
+using BF16_Tuple = ck::Tuple<BF16>;
 using F16_Tuple     = ck::Tuple<F16>;
 using F16_F16_Tuple = ck::Tuple<F16, F16>;

--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+using F16  = ck::half_t; // short names for the scalar element types
+using BF16 = ck::bhalf_t;
+using F32  = float;
+using F64  = double;
+using F16_Tuple   = ck::Tuple<F16>; // one-element ck::Tuple per scalar type
+using BF16_Tuple  = ck::Tuple<BF16>;
+using F32_Tuple   = ck::Tuple<F32>;
+using F64_Tuple   = ck::Tuple<F64>;
+using Empty_Tuple = ck::Tuple<>; // ck::Tuple with no element types
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // element-wise op shorthands
+using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+template <ck::index_t... Is> // S<...> abbreviates ck::Sequence<...>
+using S = ck::Sequence<Is...>;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <typename ADataType, // data types and element-wise ops forwarded unchanged to every row
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp> // NOTE(review): "kk" presumably names the A/B layouts - confirm
+using device_contraction_kk_instance = std::tuple< // 13 rows, all 2/2/2 dims with GemmMNKPadding
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4>
+    // clang-format on
+    >;
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_kn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   1,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   1,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   1,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,      
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,      
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   1,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,      
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>
+    // clang-format on
+    >;
+template <typename ADataType,        // forwarded to the "AData Type" slot of every row
+          typename BDataType,        // forwarded to the "BData Type" slot of every row
+          typename AccDataType,      // forwarded to the "AccData Type" slot of every row
+          typename CShuffleDataType, // forwarded to the "CShuffle DataType" slot of every row
+          typename DsDataType,       // forwarded to the "DsData Type" slot of every row
+          typename EDataType,        // forwarded to the "EData Type" slot of every row
+          typename ComputeDataType,  // forwarded to the "Compute Data Type" slot of every row
+          typename AElementwiseOp,   // forwarded to the "A Elementwise Operation" slot of every row
+          typename BElementwiseOp,   // forwarded to the "B Elementwise Operation" slot of every row
+          typename CDEElementwiseOp> // forwarded to the "CDE Elementwise Operation" slot of every row
+using device_contraction_mk_instance = std::tuple< // 16 DeviceContractionMultipleD_Xdl_CShuffle configurations (NumDimM = NumDimN = NumDimK = 2, GemmMNKPadding); A uses SrcVectorDim 1 (M of K0_M_K1), B uses SrcVectorDim 2 (K1 of K0_N_K1); rows pair up per tile shape as AK1 = 1 then AK1 = 4, and each A/B ThreadCluster multiplies out to the Block Size column
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   1,   4,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   4,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   4,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   4,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>
+    // clang-format on
+    >;
+// Candidate DeviceContractionMultipleD_Xdl_CShuffle configurations in which both the
+// A and the B block transfer read with SrcVectorDim = 1, i.e. along the M / N axis of
+// the K0_M_K1 / K0_N_K1 naming used in the column headers below.
+//
+// All ten template parameters are forwarded unchanged into the matching type /
+// elementwise-operation slot of every row; the remaining slots are fixed per row.
+//
+// Table conventions (hold for every row):
+//   * rows come in pairs per (Block Size, MPerBlock, NPerBlock) tile: first with
+//     AK1 = BK1 = 1 (DstScalarPerVector_K1 = 1, no extra LDS padding), then with
+//     AK1 = BK1 = 4 (DstScalarPerVector_K1 = 4, AddExtraM/N = 1);
+//   * the product of each A / B ThreadCluster length triple equals the Block Size
+//     column, so all threads of the block take part in both block transfers;
+//   * SrcScalarPerVector equals the per-thread extent along the vector dimension
+//     (MPerBlock or NPerBlock divided by the cluster's middle length).
+// NOTE(review): "mn" presumably names the contiguous axes of A and B - confirm
+// against the instance factory that consumes this tuple.
+template <typename ADataType,        // "AData Type" slot
+          typename BDataType,        // "BData Type" slot
+          typename AccDataType,      // "AccData Type" slot
+          typename CShuffleDataType, // "CShuffle DataType" slot
+          typename DsDataType,       // "DsData Type" slot
+          typename EDataType,        // "EData Type" slot
+          typename ComputeDataType,  // "Compute Data Type" slot
+          typename AElementwiseOp,   // "A Elementwise Operation" slot
+          typename BElementwiseOp,   // "B Elementwise Operation" slot
+          typename CDEElementwiseOp> // "CDE Elementwise Operation" slot
+using device_contraction_mn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   1,   1,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>
+    // clang-format on
+    >;
+template <typename ADataType,         // Tile-configuration list for the "f64_kk" contraction instances.
+          typename BDataType,         // Element types and elementwise ops are template parameters that
+          typename AccDataType,       // are forwarded unchanged to DeviceContractionMultipleD_Xdl_CShuffle.
+          typename CShuffleDataType,  // All ten entries fix NumDimM = NumDimN = NumDimK = 2,
+          typename DsDataType,        // GemmMNKPadding, one GEMM-K prefetch stage, KPerBlock = 16,
+          typename EDataType,         // AK1 = BK1 = 2 and 16x16 XDL tiles, and read both A and B with
+          typename ComputeDataType,   // SrcVectorDim = 2, arrange order S<1, 0, 2> and every
+          typename AElementwiseOp,    // scalar-per-vector set to 1. Entries differ in block size
+          typename BElementwiseOp,    // (256/128/64), M/N tile per block, XDL repeats per wave and
+          typename CDEElementwiseOp>  // thread-cluster lengths. Presumably "kk" = A and B both K-contiguous - TODO confirm.
+using device_contraction_f64_kk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    32,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   2,   2,   16,   16,    2,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>
+    // clang-format on
+    >;
+template <typename ADataType,         // Tile-configuration list for the "f64_kn" contraction instances.
+          typename BDataType,         // Element types and elementwise ops are template parameters that
+          typename AccDataType,       // are forwarded unchanged to DeviceContractionMultipleD_Xdl_CShuffle.
+          typename CShuffleDataType,  // Every entry fixes NumDimM = NumDimN = NumDimK = 2,
+          typename DsDataType,        // GemmMNKPadding, KPerBlock = 16, AK1 = 2 and 16x16 XDL tiles.
+          typename EDataType,         // A is read with SrcVectorDim = 2 and order S<1, 0, 2>;
+          typename ComputeDataType,   // B is read with SrcVectorDim = 1 and order S<0, 2, 1>.
+          typename AElementwiseOp,    // Rows come in pairs per tile shape: BK1 = 1 with BBlockLds
+          typename BElementwiseOp,    // AddExtraN = 0, then BK1 = 2 with AddExtraN = 1.
+          typename CDEElementwiseOp>  // Presumably "kn" = A K-contiguous, B N-contiguous - TODO confirm.
+using device_contraction_f64_kn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   1,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   1,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>
+    // clang-format on
+    >;
+template <typename ADataType,         // Tile-configuration list for the "f64_mk" contraction instances.
+          typename BDataType,         // Element types and elementwise ops are template parameters that
+          typename AccDataType,       // are forwarded unchanged to DeviceContractionMultipleD_Xdl_CShuffle.
+          typename CShuffleDataType,  // Every entry fixes NumDimM = NumDimN = NumDimK = 2,
+          typename DsDataType,        // GemmMNKPadding, KPerBlock = 16, BK1 = 2 and 16x16 XDL tiles.
+          typename EDataType,         // A is read with SrcVectorDim = 1 and order S<0, 2, 1>;
+          typename ComputeDataType,   // B is read with SrcVectorDim = 2 and order S<1, 0, 2>.
+          typename AElementwiseOp,    // Rows come in pairs per tile shape: AK1 = 1 with ABlockLds
+          typename BElementwiseOp,    // AddExtraM = 0, then AK1 = 2 with AddExtraM = 1.
+          typename CDEElementwiseOp>  // Presumably "mk" = A M-contiguous, B K-contiguous - TODO confirm.
+using device_contraction_f64_mk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   2,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   2,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>
+    // clang-format on
+    >;
+template <typename ADataType,         // Tile-configuration list for the "f64_mn" contraction instances.
+          typename BDataType,         // Element types and elementwise ops are template parameters that
+          typename AccDataType,       // are forwarded unchanged to DeviceContractionMultipleD_Xdl_CShuffle.
+          typename CShuffleDataType,  // Every entry fixes NumDimM = NumDimN = NumDimK = 2,
+          typename DsDataType,        // GemmMNKPadding, KPerBlock = 16 and 16x16 XDL tiles.
+          typename EDataType,         // Both A and B are read with SrcVectorDim = 1 and
+          typename ComputeDataType,   // order S<0, 2, 1>. Rows come in pairs per tile shape:
+          typename AElementwiseOp,    // AK1 = BK1 = 1 with AddExtraM = AddExtraN = 0, then
+          typename BElementwiseOp,    // AK1 = BK1 = 2 with AddExtraM = AddExtraN = 1.
+          typename CDEElementwiseOp>  // Presumably "mn" = A M-contiguous, B N-contiguous - TODO confirm.
+using device_contraction_f64_mn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1>, // NOTE(review): B thread cluster S<4, 16, 1> spans 64 threads while the block size is 128; the matching kn row uses S<8, 16, 1> - confirm this is intentional
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
@@ -17,7 +17,6 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #ifdef CK_ENABLE_FP32
-// float
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -28,7 +27,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F32>>>& instances);
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -40,7 +40,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F32>>>& instances);
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -52,7 +53,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F32>>>& instances);
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -64,10 +66,115 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
-#endif
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+#endif // CK_ENABLE_FP32
 #ifdef CK_ENABLE_FP64
-// double
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -78,7 +185,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F64>>>& instances);
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -90,7 +198,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F64>>>& instances);
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -102,7 +211,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F64>>>& instances);
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,8 +224,170 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
-#endif
+                                                           F64>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP64
+#ifdef CK_ENABLE_FP16
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_BF16
 // Contraction + Bilinear
 template <index_t NumDimM,
          index_t NumDimN,
@@ -123,7 +395,8 @@ template <index_t NumDimM,
          typename ADataType,
          typename BDataType,
          typename DDataType,
-          typename EDataType>
+          typename EDataType,
+          typename ComputeDataType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContractionMultipleD<
    NumDimM,
    NumDimN,
@@ -134,7 +407,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    EDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::Bilinear>>
+    ck::tensor_operation::element_wise::Bilinear,
+    ComputeDataType>>
 {
    using DeviceOp = DeviceContractionMultipleD<NumDimM,
                                                NumDimN,
@@ -145,45 +419,125 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                                                EDataType,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::Bilinear>;
+                                                ck::tensor_operation::element_wise::Bilinear,
+                                                ComputeDataType>;
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 #ifdef CK_ENABLE_FP32
        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
-                     is_same_v<DDataType, float> && is_same_v<EDataType, float>)
+                     is_same_v<EDataType, float>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
+                if constexpr(is_same_v<ComputeDataType, float>)
-                    op_ptrs);
+                {
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
-                    op_ptrs);
+                        op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
-                    op_ptrs);
+                        op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
-                    op_ptrs);
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::half_t>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_FP32
 #ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
-                     is_same_v<DDataType, double> && is_same_v<EDataType, double>)
+                     is_same_v<EDataType, double>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, double>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP64
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, ck::half_t> && is_same_v<BDataType, ck::half_t> &&
+                     is_same_v<EDataType, ck::half_t>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<EDataType, ck::bhalf_t>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
+                if constexpr(is_same_v<ComputeDataType, float>)
-                    op_ptrs);
+                {
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
-                    op_ptrs);
+                        op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
-                    op_ptrs);
+                        op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
-                    op_ptrs);
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_BF16
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
@@ -17,7 +17,6 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #ifdef CK_ENABLE_FP32
-// float
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -28,7 +27,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F32>>>& instances);
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -40,7 +40,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F32>>>& instances);
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -52,7 +53,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F32>>>& instances);
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -64,10 +66,115 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
-#endif
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+#endif // CK_ENABLE_FP32
 #ifdef CK_ENABLE_FP64
-// double
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -78,7 +185,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F64>>>& instances);
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -90,7 +198,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F64>>>& instances);
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -102,7 +211,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F64>>>& instances);
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,15 +224,178 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
-#endif
+                                                           F64>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP64
+#ifdef CK_ENABLE_FP16
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_BF16
 // Contraction + Scale
 template <index_t NumDimM,
          index_t NumDimN,
          index_t NumDimK,
          typename ADataType,
          typename BDataType,
-          typename EDataType>
+          typename EDataType,
+          typename ComputeDataType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContractionMultipleD<
    NumDimM,
    NumDimN,
@@ -133,7 +406,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    EDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::Scale>>
+    ck::tensor_operation::element_wise::Scale,
+    ComputeDataType>>
 {
    using DeviceOp = DeviceContractionMultipleD<NumDimM,
                                                NumDimN,
@@ -144,7 +418,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                                                EDataType,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::Scale>;
+                                                ck::tensor_operation::element_wise::Scale,
+                                                ComputeDataType>;
    static auto GetInstances()
    {
@@ -155,34 +430,113 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
+                if constexpr(is_same_v<ComputeDataType, float>)
-                    op_ptrs);
+                {
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
-                    op_ptrs);
+                        op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
-                    op_ptrs);
+                        op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
-                    op_ptrs);
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::half_t>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_FP32
 #ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
                     is_same_v<EDataType, double>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
+                if constexpr(is_same_v<ComputeDataType, double>)
-                    op_ptrs);
+                {
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
-                    op_ptrs);
+                        op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
-                    op_ptrs);
+                        op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
-                    op_ptrs);
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP64
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, ck::half_t> && is_same_v<BDataType, ck::half_t> &&
+                     is_same_v<EDataType, ck::half_t>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<EDataType, ck::bhalf_t>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_BF16
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
@@ -18,6 +18,8 @@ namespace instance {
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using BF8  = ck::bf8_t;
+using F8   = ck::f8_t;
 using Empty_Tuple = ck::Tuple<>;
@@ -143,6 +145,43 @@ using device_grouped_conv_bwd_data_xdl_f32_instances =
        // clang-format on
        >;
+// f16_f16_f16_comp_bf8f8: instance list with F16 A/B/E data, F32 accumulator and CShuffle types, and BF8, F8 as the two trailing arguments (presumably the A/B compute types -- confirm against DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1)
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec> // all six parameters are forwarded unchanged to every instance below
+using device_grouped_conv_bwd_data_xdl_input_fp16_comp_bf8f8_instances =
+    std::tuple<
+        // clang-format off
+         // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+         // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+         // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+         // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        // generic instance: block size 64, 64x64x32 tile; both SrcScalarPerVector columns and the CDE ScalarPerVector column are 1
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,            1,            1,     S<1, 16, 1, 4>,                1,  LoopScheduler::Default, BF8, F8>,
+        // instances for small conv.K and conv.C (the next two rows keep a SrcScalarPerVector of 1 on one of the A/B sides)
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,            1,            1,     S<1, 32, 1, 4>,                1,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,    64,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,    64,    32,   8,   8,   32,   32,       2,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8,  LoopScheduler::Default, BF8, F8>
+        // clang-format on
+        >;
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
@@ -19,6 +19,14 @@ using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+#ifdef CK_ENABLE_FP8
+using F8 = ck::f8_t;
+#endif // CK_ENABLE_FP8
+#ifdef CK_ENABLE_BF8
+using BF8 = ck::bf8_t;
+#endif // CK_ENABLE_BF8
 using Empty_Tuple = ck::Tuple<>;
 template <ck::index_t... Is>
@@ -133,6 +141,43 @@ using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances = std::tuple<
    // clang-format on
    >;
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec> // all five parameters are forwarded unchanged to every instance below
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_instances = std::tuple< // F16 in/wei/out data, F32 accumulation, ComputeTypeA = BF8 and ComputeTypeB = F8 (per the column header)
+// clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| Compute| Compute|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|   TypeA|   TypeB|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|        |        |
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |        |        |
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8 // when either macro is undefined this alias is an empty std::tuple<>
+        // generic instance: block size 64, 64x64 tile; both SrcScalarPerVector columns and the CBlockTransfer ScalarPerVector column are 2
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2,       BF8,     F8>,   
+        // instance for small conv.K (NOTE(review): the next two rows each keep one SrcScalarPerVector at 2; the second presumably targets small conv.C -- confirm)
+        // for fp16, conv.K and conv.C must be divisible by 2,
+        // since half_t atomic_add requires scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2,       BF8,     F8>,  
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>
+#endif // CK_ENABLE_FP8 && CK_ENABLE_BF8
+    // clang-format on
+    >;
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation