Merge branch 'develop' into amd-develop

082cf643 · Jun Liu · 7e8230da · 59136091 · 082cf643 · 082cf643
Commit 082cf643 authored Oct 05, 2023 by Jun Liu
20 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -31,7 +31,7 @@ namespace ck {
 //   D0, D1, ... and E have the same layout
 template <typename ADataType,
          typename BDataType,
-          typename ComputeDataType_,
+          typename AComputeDataType_,
          typename AccDataType,
          typename CShuffleDataType,
          typename DsDataType,
@@ -72,7 +72,8 @@ template <typename ADataType,
          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CDEShuffleBlockTransferScalarPerVector_NPerBlock,
          LoopScheduler LoopSched,
-          PipelineVersion PipelineVer = PipelineVersion::v1>
+          PipelineVersion PipelineVer = PipelineVersion::v1,
+          typename BComputeDataType   = AComputeDataType_>
 struct GridwiseGemmMultipleD_xdl_cshuffle
 {
    static constexpr index_t NumDTensor = DsDataType::Size();
@@ -100,10 +101,10 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
        decltype(GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;

 #if CK_WORKAROUND_DENORM_FIX
-    using ComputeDataType =
-        conditional_t<is_same_v<ComputeDataType_, ck::half_t>, ck::bhalf_t, ComputeDataType_>;
+    using AComputeDataType =
+        conditional_t<is_same_v<AComputeDataType_, ck::half_t>, ck::bhalf_t, AComputeDataType_>;
 #else
-    using ComputeDataType = ComputeDataType_;
+    using AComputeDataType = AComputeDataType_;
 #endif

    __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
@@ -172,8 +173,8 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
        constexpr auto c_block_size =
            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();

-        return math::max((a_block_space_size_aligned + b_block_space_size_aligned) *
-                             sizeof(ComputeDataType),
+        return math::max(a_block_space_size_aligned * sizeof(AComputeDataType) +
+                             b_block_space_size_aligned * sizeof(BComputeDataType),
                         c_block_size * sizeof(CShuffleDataType));
    }

@@ -502,7 +503,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                                ABlockTransferThreadClusterArrangeOrder,
                                                ADataType,
-                                                ComputeDataType,
+                                                AComputeDataType,
                                                decltype(a_grid_desc_ak0_m_ak1),
                                                decltype(a_block_desc_ak0_m_ak1),
                                                ABlockTransferSrcAccessOrder,
@@ -533,7 +534,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                                BBlockTransferThreadClusterArrangeOrder,
                                                BDataType,
-                                                ComputeDataType,
+                                                BComputeDataType,
                                                decltype(b_grid_desc_bk0_n_bk1),
                                                decltype(b_block_desc_bk0_n_bk1),
                                                BBlockTransferSrcAccessOrder,
@@ -561,14 +562,15 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
        //       register
        // sanity check
-        constexpr index_t KPack =
-            math::max(math::lcm(AK1, BK1),
-                      MfmaSelector<ComputeDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        constexpr index_t KPack = math::max(
+            math::lcm(AK1, BK1),
+            MfmaSelector<AComputeDataType, MPerXdl, NPerXdl, BComputeDataType>::selected_mfma
+                .k_per_blk);

        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
-            ComputeDataType,
-            ComputeDataType,
+            AComputeDataType,
+            BComputeDataType,
            AccDataType,
            decltype(a_block_desc_ak0_m_ak1),
            decltype(b_block_desc_bk0_n_bk1),
@@ -586,10 +588,10 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<AComputeDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeDataType*>(p_shared) + a_block_space_size_aligned,
+            static_cast<BComputeDataType*>(p_shared) + a_block_space_size_aligned,
            b_block_desc_bk0_n_bk1.GetElementSpaceSize());

        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -139,7 +139,8 @@ __host__ __device__ constexpr auto make_merge_transform_v4_no_carry(const LowLen
 }

 template <typename GridwiseGemm,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatC,
          typename AGridDesc_B_K0_M_K1,
          typename BGridDesc_B_K0_N_K1,
@@ -153,8 +154,8 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_bwd_weight(const FloatAB* __restrict__ p_a_grid,
-                                      const FloatAB* __restrict__ p_b_grid,
+        kernel_gemm_xdlops_bwd_weight(const FloatA* __restrict__ p_a_grid,
+                                      const FloatB* __restrict__ p_b_grid,
                                      FloatC* __restrict__ p_c_grid,
                                      const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc,
                                      const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc,
@@ -195,7 +196,8 @@ __global__ void
 }

 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename FloatC,
          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
@@ -242,7 +244,9 @@ template <index_t BlockSize,
          bool ABlockLdsExtraM1Wrw      = false,
          bool BBlockLdsExtraN1Wrw      = false,
          index_t NumGemmKPrefetchStage = 1,
-          PipelineVersion PipelineVer   = PipelineVersion::v1>
+          PipelineVersion PipelineVer   = PipelineVersion::v1,
+          typename ComputeTypeA         = FloatA,
+          typename ComputeTypeB         = ComputeTypeA>
 struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
 {
    static constexpr auto I0 = Number<0>{};
@@ -265,11 +269,16 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
    // denorm test fix, required to work around fp16 mfma issue
    // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
    // when mfma if fixed, remove this section and update
-    // FloatABAdjusted -> FloatAB throughout this file
+    // FloatAAdjusted -> ComputeTypeA, FloatBAdjusted -> ComputeTypeB,
+    // throughout this file
 #if CK_WORKAROUND_DENORM_FIX
-    using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
+    using FloatAAdjusted =
+        conditional_t<is_same_v<ComputeTypeA, ck::half_t>, ck::bhalf_t, ComputeTypeA>;
+    using FloatBAdjusted =
+        conditional_t<is_same_v<ComputeTypeB, ck::half_t>, ck::bhalf_t, ComputeTypeB>;
 #else
-    using FloatABAdjusted = FloatAB;
+    using FloatAAdjusted = ComputeTypeA;
+    using FloatBAdjusted = ComputeTypeB;
 #endif

    // M0/M1/M1Padding
@@ -506,7 +515,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        constexpr auto c_block_size =
            GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock().GetElementSpaceSize();

-        return math::max((a_block_space_size + b_block_space_size) * sizeof(FloatAB),
+        return math::max((a_block_space_size * sizeof(FloatAAdjusted) +
+                          b_block_space_size * sizeof(FloatBAdjusted)),
                         c_block_size * sizeof(FloatC));
    }

@@ -610,8 +620,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
    using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1));

    template <bool HasMainKBlockLoop>
-    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
-                               const FloatAB* __restrict__ p_b_grid,
+    __device__ static void Run(const FloatA* __restrict__ p_a_grid,
+                               const FloatB* __restrict__ p_b_grid,
                               FloatC* __restrict__ p_c_grid,
                               void* __restrict__ p_shared,
                               const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
@@ -673,8 +683,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                Sequence<1, K0PerBlock, MPerBlock, K1>,
                                                ABlockTransferThreadClusterLengths_K0_M_K1,
                                                ABlockTransferThreadClusterArrangeOrder,
-                                                FloatAB,
-                                                FloatABAdjusted,
+                                                FloatA,
+                                                FloatAAdjusted,
                                                decltype(a_b_k0_m_k1_grid_desc),
                                                decltype(a_b_k0_m_k1_block_desc),
                                                ABlockTransferSrcAccessOrder,
@@ -703,8 +713,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                Sequence<1, K0PerBlock, NPerBlock, K1>,
                                                BBlockTransferThreadClusterLengths_K0_N_K1,
                                                BBlockTransferThreadClusterArrangeOrder,
-                                                FloatAB,
-                                                FloatABAdjusted,
+                                                FloatB,
+                                                FloatBAdjusted,
                                                decltype(b_b_k0_n_k1_grid_desc),
                                                decltype(b_b_k0_n_k1_block_desc),
                                                BBlockTransferSrcAccessOrder,
@@ -733,12 +743,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        // sanity check

        constexpr index_t KPack =
-            math::max(K1, MfmaSelector<FloatABAdjusted, MPerXDL, NPerXDL>::selected_mfma.k_per_blk);
+            math::max(K1,
+                      MfmaSelector<FloatAAdjusted, MPerXDL, NPerXDL, FloatBAdjusted>::selected_mfma
+                          .k_per_blk);

        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatABAdjusted,
-                                                                FloatABAdjusted,
+                                                                FloatAAdjusted,
+                                                                FloatBAdjusted,
                                                                FloatAcc,
                                                                decltype(a_k0_m_k1_block_desc),
                                                                decltype(b_k0_n_k1_block_desc),
@@ -758,10 +770,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0);

        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatABAdjusted*>(p_shared), a_k0_m_k1_block_desc.GetElementSpaceSize());
+            static_cast<FloatAAdjusted*>(p_shared), a_k0_m_k1_block_desc.GetElementSpaceSize());

        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatABAdjusted*>(p_shared) + a_block_space_size,
+            static_cast<FloatBAdjusted*>(p_shared) + a_block_space_size,
            b_k0_n_k1_block_desc.GetElementSpaceSize());

        // gridwise GEMM pipeline

--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -32,8 +32,12 @@ enum struct MfmaInstr
    mfma_f64_16x16x4f64,
    mfma_f32_32x32x16f8f8,
    mfma_f32_16x16x32f8f8,
+    mfma_f32_32x32x16bf8bf8,
+    mfma_f32_16x16x32bf8bf8,
    mfma_f32_32x32x16f8bf8,
-    mfma_f32_16x16x32f8bf8
+    mfma_f32_16x16x32f8bf8,
+    mfma_f32_32x32x16bf8f8,
+    mfma_f32_16x16x32bf8f8
 };

 template <MfmaInstr instr>
@@ -504,6 +508,52 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8f8>
 };
 #endif

+#if defined CK_ENABLE_BF8
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_32x32x16bf8bf8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16;
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_32x32x16bf8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8bf8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_16x16x32bf8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+#endif
+
 #if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x16f8bf8>
@@ -550,6 +600,52 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8bf8>
 };
 #endif

+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_32x32x16bf8f8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16;
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_32x32x16bf8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f32_16x16x32bf8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+#endif
+
 template <typename base_type,
          index_t MPerXdlops,
          index_t NPerXdlops,
@@ -710,6 +806,20 @@ struct MfmaSelector
    }
 #endif

+#if defined CK_ENABLE_BF8
+    template <>
+    static constexpr auto GetMfma<bf8_t, 32, 32>()
+    {
+        return MfmaInstr::mfma_f32_32x32x16bf8bf8;
+    }
+
+    template <>
+    static constexpr auto GetMfma<bf8_t, 16, 16>()
+    {
+        return MfmaInstr::mfma_f32_16x16x32bf8bf8;
+    }
+#endif
+
 #if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
    template <>
    static constexpr auto GetMfma<f8_t, 32, 32, bf8_t>()
@@ -724,6 +834,20 @@ struct MfmaSelector
    }
 #endif

+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+    template <>
+    static constexpr auto GetMfma<bf8_t, 32, 32, f8_t>()
+    {
+        return MfmaInstr::mfma_f32_32x32x16bf8f8;
+    }
+
+    template <>
+    static constexpr auto GetMfma<bf8_t, 16, 16, f8_t>()
+    {
+        return MfmaInstr::mfma_f32_16x16x32bf8f8;
+    }
+#endif
+
    static constexpr auto selected_mfma =
        mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops, additional_type>()>{};

@@ -931,8 +1055,12 @@ struct XdlopsGemm
 #if defined CK_ENABLE_FP8
                || is_same<base_type, f8_t>::value
 #endif
+#if defined CK_ENABLE_BF8
+                || is_same<base_type, bf8_t>::value
+#endif
 #if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-                || (is_same<base_type, f8_t>::value && is_same<additional_type, bf8_t>::value)
+                || (is_same<base_type, f8_t>::value && is_same<additional_type, bf8_t>::value) ||
+                (is_same<base_type, bf8_t>::value && is_same<additional_type, f8_t>::value)
 #endif
                ,
            "base base_type must be double, float, half, bfloat16, int8_t, f8_t or bf8_t!");

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -420,6 +420,71 @@ struct intrin_mfma_f32_16x16x32f8f8<16, 16>
 };
 #endif

+#if defined CK_ENABLE_BF8
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_32x32x16bf8bf8;
+
+template <>
+struct intrin_mfma_f32_32x32x16bf8bf8<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const bf8x8_t& reg_a, const bf8x8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+        reg_c.template AsType<float16_t>()(Number<0>{}) =
+            __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
+                bit_cast<long>(reg_a),
+                bit_cast<long>(reg_b),
+                reg_c.template AsType<float16_t>()[Number<0>{}],
+                0,
+                0,
+                0);
+#else
+        vector_type<bf8_t, 8> reg_a_v(reg_a);
+        vector_type<bf8_t, 8> reg_b_v(reg_b);
+
+        static_for<0, 8, 1>{}([&](auto k) {
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<bf8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<bf8_t>()[Number<k>{}]);
+
+            intrin_mfma_f32_32x32x2f32<32, 32>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_16x16x32bf8bf8;
+
+template <>
+struct intrin_mfma_f32_16x16x32bf8bf8<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const bf8x8_t& reg_a, const bf8x8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(
+            bit_cast<long>(reg_a),
+            bit_cast<long>(reg_b),
+            reg_c.template AsType<float4_t>()[Number<0>{}],
+            0,
+            0,
+            0);
+#else
+        vector_type<bf8_t, 8> reg_a_v(reg_a);
+        vector_type<bf8_t, 8> reg_b_v(reg_b);
+
+        static_for<0, 8, 1>{}([&](auto k) {
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<bf8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<bf8_t>()[Number<k>{}]);
+
+            intrin_mfma_f32_16x16x4f32<16, 16>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+#endif
+
 #if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16f8bf8;
@@ -484,5 +549,70 @@ struct intrin_mfma_f32_16x16x32f8bf8<16, 16>
    }
 };
 #endif
+
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_32x32x16bf8f8;
+
+template <>
+struct intrin_mfma_f32_32x32x16bf8f8<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const bf8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+        reg_c.template AsType<float16_t>()(Number<0>{}) =
+            __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
+                bit_cast<long>(reg_a),
+                bit_cast<long>(reg_b),
+                reg_c.template AsType<float16_t>()[Number<0>{}],
+                0,
+                0,
+                0);
+#else
+        vector_type<bf8_t, 8> reg_a_v(reg_a);
+        vector_type<f8_t, 8> reg_b_v(reg_b);
+
+        static_for<0, 8, 1>{}([&](auto k) {
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<bf8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<f8_t>()[Number<k>{}]);
+
+            intrin_mfma_f32_32x32x2f32<32, 32>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_16x16x32bf8f8;
+
+template <>
+struct intrin_mfma_f32_16x16x32bf8f8<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const bf8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
+    {
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(
+            bit_cast<long>(reg_a),
+            bit_cast<long>(reg_b),
+            reg_c.template AsType<float4_t>()[Number<0>{}],
+            0,
+            0,
+            0);
+#else
+        vector_type<bf8_t, 8> reg_a_v(reg_a);
+        vector_type<f8_t, 8> reg_b_v(reg_b);
+
+        static_for<0, 8, 1>{}([&](auto k) {
+            float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<bf8_t>()[Number<k>{}]);
+            float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<f8_t>()[Number<k>{}]);
+
+            intrin_mfma_f32_16x16x4f32<16, 16>::Run(reg_a_f32, reg_b_f32, reg_c);
+        });
+#endif
+    }
+};
+#endif
 } // namespace ck
 #endif
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -221,7 +221,7 @@ inline __host__ __device__ bf8_t type_convert<bf8_t, half_t>(half_t x)
 {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
    // convert to float and use native converion
-    return type_convert<f8_t>(type_convert<float>(x));
+    return type_convert<bf8_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
@@ -23,7 +23,6 @@ template <ck::index_t NumDimM,
          typename BDataType,
          typename CDataType,
          typename AccDataType,
-          typename ComputeDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
@@ -70,24 +69,19 @@ struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::Base
                {
                    for(ck::index_t k1 = 0; k1 < K1; ++k1)
                    {
-                        // Simulate the possible casting when ComputeDataType is different than the
-                        // A/B data types
-                        ComputeDataType v_a_compute_input =
-                            ck::type_convert<ComputeDataType>(arg.a_ms_ks_(m0, m1, k0, k1));
-                        ComputeDataType v_b_compute_input =
-                            ck::type_convert<ComputeDataType>(arg.b_ns_ks_(n0, n1, k0, k1));
-
                        AccDataType v_a;
                        AccDataType v_b;

-                        arg.a_element_op_(v_a, ck::type_convert<AccDataType>(v_a_compute_input));
-                        arg.b_element_op_(v_b, ck::type_convert<AccDataType>(v_b_compute_input));
+                        arg.a_element_op_(
+                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
+                        arg.b_element_op_(
+                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));

                        v_acc += v_a * v_b;
                    }
                }

-                arg.c_ms_ns_(m0, m1, n0, n1) = ck::type_convert<CDataType>(v_acc);
+                arg.c_ms_ns_(m0, m1, n0, n1) = v_acc;
            };

            make_ParallelTensorFunctor(f_ms_ns,

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
@@ -25,6 +25,8 @@ template <ck::index_t NDimSpatial,
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation,
+          typename ComputeTypeA                                                     = OutDataType,
+          typename ComputeTypeB                                                     = InDataType,
          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
 struct ReferenceConvBwdWeight : public device::BaseOperator
 {
@@ -98,8 +100,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                            if(wi >= 0 &&
                               ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
                            {
-                                float v_out;
-                                float v_in;
+                                ComputeTypeA v_out;
+                                ComputeTypeB v_in;

                                arg.out_element_op_(
                                    v_out, ck::type_convert<float>(arg.output_(g, n, k, wo)));
@@ -107,7 +109,7 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                arg.in_element_op_(
                                    v_in, ck::type_convert<float>(arg.input_(g, n, c, wi)));

-                                v_acc += v_out * v_in;
+                                v_acc += type_convert<float>(v_out) * type_convert<float>(v_in);
                            }
                        }
                    }
@@ -158,8 +160,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                   wi >= 0 &&
                                   ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
                                {
-                                    float v_out;
-                                    float v_in;
+                                    ComputeTypeA v_out;
+                                    ComputeTypeB v_in;

                                    arg.out_element_op_(
                                        v_out,
@@ -168,7 +170,7 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                    arg.in_element_op_(
                                        v_in, ck::type_convert<float>(arg.input_(g, n, c, hi, wi)));

-                                    v_acc += v_out * v_in;
+                                    v_acc += type_convert<float>(v_out) * type_convert<float>(v_in);
                                }
                            }
                        }
@@ -226,8 +228,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                       ck::type_convert<std::size_t>(wi) <
                                           arg.input_.GetLengths()[5])
                                    {
-                                        float v_out;
-                                        float v_in;
+                                        ComputeTypeA v_out;
+                                        ComputeTypeB v_in;

                                        arg.out_element_op_(v_out,
                                                            ck::type_convert<float>(
@@ -237,7 +239,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                                           ck::type_convert<float>(
                                                               arg.input_(g, n, c, di, hi, wi)));

-                                        v_acc += v_out * v_in;
+                                        v_acc +=
+                                            type_convert<float>(v_out) * type_convert<float>(v_in);
                                    }
                                }
                            }

--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -29,8 +29,6 @@ using BF8 = ck::bf8_t;

 using Empty_Tuple = ck::Tuple<>;

-using BF16_Tuple = ck::Tuple<BF16>;
-
 using F16_Tuple     = ck::Tuple<F16>;
 using F16_F16_Tuple = ck::Tuple<F16, F16>;


--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using F16  = ck::half_t;
-using BF16 = ck::bhalf_t;
-using F32  = float;
-using F64  = double;
-
-using F16_Tuple   = ck::Tuple<F16>;
-using BF16_Tuple  = ck::Tuple<BF16>;
-using F32_Tuple   = ck::Tuple<F32>;
-using F64_Tuple   = ck::Tuple<F64>;
-using Empty_Tuple = ck::Tuple<>;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-using Scale       = ck::tensor_operation::element_wise::Scale;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CShuffleDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename ComputeDataType,
-          typename AElementwiseOp,
-          typename BElementwiseOp,
-          typename CDEElementwiseOp>
-using device_contraction_kk_instance = std::tuple<
-    // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4>
-    // clang-format on
-    >;
-
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CShuffleDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename ComputeDataType,
-          typename AElementwiseOp,
-          typename BElementwiseOp,
-          typename CDEElementwiseOp>
-using device_contraction_kn_instance = std::tuple<
-    // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   1,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   1,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   1,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,      
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,      
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   1,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,      
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>
-    // clang-format on
-    >;
-
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CShuffleDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename ComputeDataType,
-          typename AElementwiseOp,
-          typename BElementwiseOp,
-          typename CDEElementwiseOp>
-using device_contraction_mk_instance = std::tuple<
-    // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   1,   4,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   4,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   4,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   4,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>
-    // clang-format on
-    >;
-
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CShuffleDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename ComputeDataType,
-          typename AElementwiseOp,
-          typename BElementwiseOp,
-          typename CDEElementwiseOp>
-using device_contraction_mn_instance = std::tuple<
-    // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   1,   1,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4>
-    // clang-format on
-    >;
-
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CShuffleDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename ComputeDataType,
-          typename AElementwiseOp,
-          typename BElementwiseOp,
-          typename CDEElementwiseOp>
-using device_contraction_f64_kk_instance = std::tuple<
-    // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    32,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   2,   2,   16,   16,    2,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1>
-    // clang-format on
-    >;
-
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CShuffleDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename ComputeDataType,
-          typename AElementwiseOp,
-          typename BElementwiseOp,
-          typename CDEElementwiseOp>
-using device_contraction_f64_kn_instance = std::tuple<
-    // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   1,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   1,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>
-    // clang-format on
-    >;
-
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CShuffleDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename ComputeDataType,
-          typename AElementwiseOp,
-          typename BElementwiseOp,
-          typename CDEElementwiseOp>
-using device_contraction_f64_mk_instance = std::tuple<
-    // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   2,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   2,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,               1>
-    // clang-format on
-    >;
-
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CShuffleDataType,
-          typename DsDataType,
-          typename EDataType,
-          typename ComputeDataType,
-          typename AElementwiseOp,
-          typename BElementwiseOp,
-          typename CDEElementwiseOp>
-using device_contraction_f64_mn_instance = std::tuple<
-    // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|         Compute|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|            Data|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //#####################################|        |        |        |           |          |            |                 |           |          |            Type|      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //#####################################|        |        |        |           |          |            |                 |           |          |                |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ComputeDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1>
-    // clang-format on
-    >;
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
@@ -17,6 +17,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #ifdef CK_ENABLE_FP32
+// float
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -27,8 +28,7 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
+                                                           Bilinear>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -40,8 +40,7 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
+                                                           Bilinear>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -53,8 +52,7 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
+                                                           Bilinear>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -66,115 +64,10 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           F32_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F16>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           F32_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F16>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           F32_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F16>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           F32_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F16>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           F32_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           BF16>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           F32_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           BF16>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           F32_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           BF16>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           F32_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           BF16>>>& instances);
-#endif // CK_ENABLE_FP32
-
+                                                           Bilinear>>>& instances);
+#endif
 #ifdef CK_ENABLE_FP64
+// double
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -185,8 +78,7 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear,
-                                                           F64>>>& instances);
+                                                           Bilinear>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -198,8 +90,7 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear,
-                                                           F64>>>& instances);
+                                                           Bilinear>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -211,8 +102,7 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear,
-                                                           F64>>>& instances);
+                                                           Bilinear>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -224,170 +114,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear,
-                                                           F64>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F64,
-                                                           F64,
-                                                           F64_Tuple,
-                                                           F64,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F64,
-                                                           F64,
-                                                           F64_Tuple,
-                                                           F64,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F64,
-                                                           F64,
-                                                           F64_Tuple,
-                                                           F64,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F64,
-                                                           F64,
-                                                           F64_Tuple,
-                                                           F64,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-#endif // CK_ENABLE_FP64
-
-#ifdef CK_ENABLE_FP16
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F16,
-                                                           F16,
-                                                           F16_Tuple,
-                                                           F16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F16,
-                                                           F16,
-                                                           F16_Tuple,
-                                                           F16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F16,
-                                                           F16,
-                                                           F16_Tuple,
-                                                           F16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F16,
-                                                           F16,
-                                                           F16_Tuple,
-                                                           F16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-#endif // CK_ENABLE_FP16
-
-#ifdef CK_ENABLE_BF16
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           BF16,
-                                                           BF16,
-                                                           BF16_Tuple,
-                                                           BF16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           BF16,
-                                                           BF16,
-                                                           BF16_Tuple,
-                                                           BF16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           BF16,
-                                                           BF16,
-                                                           BF16_Tuple,
-                                                           BF16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-
-void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           BF16,
-                                                           BF16,
-                                                           BF16_Tuple,
-                                                           BF16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Bilinear,
-                                                           F32>>>& instances);
-#endif // CK_ENABLE_FP16
-
+                                                           Bilinear>>>& instances);
+#endif
 // Contraction + Bilinear
 template <index_t NumDimM,
          index_t NumDimN,
@@ -395,8 +123,7 @@ template <index_t NumDimM,
          typename ADataType,
          typename BDataType,
          typename DDataType,
-          typename EDataType,
-          typename ComputeDataType>
+          typename EDataType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContractionMultipleD<
    NumDimM,
    NumDimN,
@@ -407,8 +134,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    EDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::Bilinear,
-    ComputeDataType>>
+    ck::tensor_operation::element_wise::Bilinear>>
 {
    using DeviceOp = DeviceContractionMultipleD<NumDimM,
                                                NumDimN,
@@ -419,125 +145,45 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                                                EDataType,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::Bilinear,
-                                                ComputeDataType>;
+                                                ck::tensor_operation::element_wise::Bilinear>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 #ifdef CK_ENABLE_FP32
        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
-                     is_same_v<EDataType, float>)
+                     is_same_v<DDataType, float> && is_same_v<EDataType, float>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                if constexpr(is_same_v<ComputeDataType, float>)
-                {
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
-                        op_ptrs);
                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
                    op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
                    op_ptrs);
                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
                    op_ptrs);
-                }
-                else if constexpr(is_same_v<ComputeDataType, ck::half_t>)
-                {
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
-                        op_ptrs);
-                }
-                else if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
-                {
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
                    op_ptrs);
            }
        }
-        }
-#endif // CK_ENABLE_FP32
+#endif
 #ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
-                     is_same_v<EDataType, double>)
+                     is_same_v<DDataType, double> && is_same_v<EDataType, double>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
-            {
-                if constexpr(is_same_v<ComputeDataType, double>)
            {
                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
                    op_ptrs);
                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
                    op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
-                        op_ptrs);
                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
                    op_ptrs);
-                }
-                else if constexpr(is_same_v<ComputeDataType, float>)
-                {
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
-                        op_ptrs);
-                }
-            }
-        }
-#endif // CK_ENABLE_FP64
-#ifdef CK_ENABLE_FP16
-        if constexpr(is_same_v<ADataType, ck::half_t> && is_same_v<BDataType, ck::half_t> &&
-                     is_same_v<EDataType, ck::half_t>)
-        {
-            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
-            {
-                if constexpr(is_same_v<ComputeDataType, float>)
-                {
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
-                        op_ptrs);
-                }
-            }
-        }
-#endif // CK_ENABLE_FP16
-#ifdef CK_ENABLE_BF16
-        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
-                     is_same_v<EDataType, ck::bhalf_t>)
-        {
-            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
-            {
-                if constexpr(is_same_v<ComputeDataType, float>)
-                {
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
-                        op_ptrs);
-                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
                    op_ptrs);
            }
        }
-        }
-#endif // CK_ENABLE_BF16
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
@@ -17,6 +17,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #ifdef CK_ENABLE_FP32
+// float
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -27,8 +28,7 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
+                                                           Scale>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -40,8 +40,7 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
+                                                           Scale>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -53,8 +52,7 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
+                                                           Scale>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -66,115 +64,10 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           Empty_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F16>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           Empty_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F16>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           Empty_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F16>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           Empty_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F16>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           Empty_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           BF16>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           Empty_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           BF16>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           Empty_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           BF16>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F32,
-                                                           F32,
-                                                           Empty_Tuple,
-                                                           F32,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           BF16>>>& instances);
-#endif // CK_ENABLE_FP32
-
+                                                           Scale>>>& instances);
+#endif
 #ifdef CK_ENABLE_FP64
+// double
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -185,8 +78,7 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale,
-                                                           F64>>>& instances);
+                                                           Scale>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -198,8 +90,7 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale,
-                                                           F64>>>& instances);
+                                                           Scale>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -211,8 +102,7 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale,
-                                                           F64>>>& instances);
+                                                           Scale>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -224,178 +114,15 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale,
-                                                           F64>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F64,
-                                                           F64,
-                                                           Empty_Tuple,
-                                                           F64,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F64,
-                                                           F64,
-                                                           Empty_Tuple,
-                                                           F64,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F64,
-                                                           F64,
-                                                           Empty_Tuple,
-                                                           F64,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F64,
-                                                           F64,
-                                                           Empty_Tuple,
-                                                           F64,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-#endif // CK_ENABLE_FP64
-
-#ifdef CK_ENABLE_FP16
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F16,
-                                                           F16,
-                                                           Empty_Tuple,
-                                                           F16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F16,
-                                                           F16,
-                                                           Empty_Tuple,
-                                                           F16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F16,
-                                                           F16,
-                                                           Empty_Tuple,
-                                                           F16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           F16,
-                                                           F16,
-                                                           Empty_Tuple,
-                                                           F16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-#endif // CK_ENABLE_FP16
-
-#ifdef CK_ENABLE_BF16
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           BF16,
-                                                           BF16,
-                                                           Empty_Tuple,
-                                                           BF16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           BF16,
-                                                           BF16,
-                                                           Empty_Tuple,
-                                                           BF16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           BF16,
-                                                           BF16,
-                                                           Empty_Tuple,
-                                                           BF16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-
-void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
-    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
-                                                           2,
-                                                           2,
-                                                           BF16,
-                                                           BF16,
-                                                           Empty_Tuple,
-                                                           BF16,
-                                                           PassThrough,
-                                                           PassThrough,
-                                                           Scale,
-                                                           F32>>>& instances);
-#endif // CK_ENABLE_FP16
-
+                                                           Scale>>>& instances);
+#endif
 // Contraction + Scale
 template <index_t NumDimM,
          index_t NumDimN,
          index_t NumDimK,
          typename ADataType,
          typename BDataType,
-          typename EDataType,
-          typename ComputeDataType>
+          typename EDataType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContractionMultipleD<
    NumDimM,
    NumDimN,
@@ -406,8 +133,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    EDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::Scale,
-    ComputeDataType>>
+    ck::tensor_operation::element_wise::Scale>>
 {
    using DeviceOp = DeviceContractionMultipleD<NumDimM,
                                                NumDimN,
@@ -418,8 +144,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                                                EDataType,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::Scale,
-                                                ComputeDataType>;
+                                                ck::tensor_operation::element_wise::Scale>;

    static auto GetInstances()
    {
@@ -429,8 +154,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                     is_same_v<EDataType, float>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
-            {
-                if constexpr(is_same_v<ComputeDataType, float>)
            {
                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
                    op_ptrs);
@@ -441,38 +164,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
                    op_ptrs);
            }
-                else if constexpr(is_same_v<ComputeDataType, ck::half_t>)
-                {
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
-                        op_ptrs);
-                }
-                else if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
-                {
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
-                        op_ptrs);
-                }
-            }
        }
-#endif // CK_ENABLE_FP32
+#endif
 #ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
                     is_same_v<EDataType, double>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
-            {
-                if constexpr(is_same_v<ComputeDataType, double>)
            {
                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
                    op_ptrs);
@@ -483,60 +181,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
                    op_ptrs);
            }
-                else if constexpr(is_same_v<ComputeDataType, float>)
-                {
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
-                        op_ptrs);
-                }
-            }
-        }
-#endif // CK_ENABLE_FP64
-#ifdef CK_ENABLE_FP16
-        if constexpr(is_same_v<ADataType, ck::half_t> && is_same_v<BDataType, ck::half_t> &&
-                     is_same_v<EDataType, ck::half_t>)
-        {
-            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
-            {
-                if constexpr(is_same_v<ComputeDataType, float>)
-                {
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
-                        op_ptrs);
-                }
-            }
-        }
-#endif // CK_ENABLE_FP16
-#ifdef CK_ENABLE_BF16
-        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
-                     is_same_v<EDataType, ck::bhalf_t>)
-        {
-            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
-            {
-                if constexpr(is_same_v<ComputeDataType, float>)
-                {
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
-                        op_ptrs);
-                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
-                        op_ptrs);
-                }
-            }
        }
-#endif // CK_ENABLE_BF16
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
@@ -18,6 +18,8 @@ namespace instance {
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
+using BF8  = ck::bf8_t;
+using F8   = ck::f8_t;

 using Empty_Tuple = ck::Tuple<>;

@@ -143,6 +145,43 @@ using device_grouped_conv_bwd_data_xdl_f32_instances =
        // clang-format on
        >;

+// f16_f16_f16_comp_f8
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionBackwardDataSpecialization ConvSpec>
+using device_grouped_conv_bwd_data_xdl_input_fp16_comp_bf8f8_instances =
+    std::tuple<
+        // clang-format off
+         // ##############################################|       NDim| ALayout| BLayout|    DsLayout| ELayout| AData| BData| AccData| CShuffle|      DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+         // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+         // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+         // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        // generic instance
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,            1,            1,     S<1, 16, 1, 4>,                1,  LoopScheduler::Default, BF8, F8>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,            1,            1,     S<1, 32, 1, 4>,                1,  LoopScheduler::Default, BF8, F8>,
+
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,    64,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,    64,    32,   8,   8,   32,   32,       2,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 4>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 8>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 16, 1, 4>,                4,  LoopScheduler::Default, BF8, F8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F32, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,        S<4, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8,  LoopScheduler::Default, BF8, F8>
+        // clang-format on
+        >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
@@ -19,6 +19,14 @@ using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;

+#ifdef CK_ENABLE_FP8
+using F8 = ck::f8_t;
+#endif
+
+#ifdef CK_ENABLE_BF8
+using BF8 = ck::bf8_t;
+#endif
+
 using Empty_Tuple = ck::Tuple<>;

 template <ck::index_t... Is>
@@ -133,6 +141,43 @@ using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances = std::tuple<
    // clang-format on
    >;

+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec>
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_instances = std::tuple<
+// clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| Compute| Compute|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|   TypeA|   TypeB|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|        |        |
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |        |        |
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+        // generic instance
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2,       BF8,     F8>,   
+        // instance for small conv.K
+        // for fp16 conv.K and conv.C must be divisible by 2
+        // since half_t atomic_add require scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2,       BF8,     F8>,  
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,
+
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>
+#endif
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
@@ -13,6 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

+#ifdef CK_ENABLE_FP8
+using F8 = ck::f8_t;
+#endif
+
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
@@ -174,6 +178,42 @@ using device_grouped_conv_fwd_xdl_int8_instances = std::tuple<
    // clang-format on
    >;

+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_f16_comp_f8_instances = std::tuple<
+// clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| ComputeType|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |            |
+#ifdef CK_ENABLE_FP8
+        // generic instance
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1,         F8>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,         F8>,
+
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8,         F8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8,         F8>
+#endif
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
@@ -426,13 +426,32 @@ void add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_1x1s1p0_ins
                                                                  PassThrough,
                                                                  PassThrough>>>& instances);
 #endif
+#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  BF8,
+                                                                  F8>>>& instances);
+#endif
 template <ck::index_t NumDimSpatial,
          typename OutLayout,
          typename WeiLayout,
          typename InLayout,
          typename OutDataType,
          typename WeiDataType,
-          typename InDataType>
+          typename InDataType,
+          typename ComputeTypeA,
+          typename ComputeTypeB>
 struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<
        NumDimSpatial,
@@ -446,7 +465,9 @@ struct DeviceOperationInstanceFactory<
        InDataType,
        ck::tensor_operation::element_wise::PassThrough,
        ck::tensor_operation::element_wise::PassThrough,
-        ck::tensor_operation::element_wise::PassThrough>>
+        ck::tensor_operation::element_wise::PassThrough,
+        ComputeTypeA,
+        ComputeTypeB>>
 {
    using DeviceOp =
        DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
@@ -460,7 +481,9 @@ struct DeviceOperationInstanceFactory<
                                          InDataType,
                                          ck::tensor_operation::element_wise::PassThrough,
                                          ck::tensor_operation::element_wise::PassThrough,
-                                          ck::tensor_operation::element_wise::PassThrough>;
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ComputeTypeA,
+                                          ComputeTypeB>;

    static auto GetInstances()
    {
@@ -597,7 +620,8 @@ struct DeviceOperationInstanceFactory<
            {
 #ifdef CK_ENABLE_FP16
                if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
-                             is_same_v<OutDataType, F16>)
+                             is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, F16> &&
+                             is_same_v<ComputeTypeB, F16>)
                {
                    add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances(
                        op_ptrs);
@@ -607,6 +631,15 @@ struct DeviceOperationInstanceFactory<
                        op_ptrs);
                }
 #endif
+#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+                else if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
+                                  is_same_v<OutDataType, F16> && is_same_v<ComputeTypeA, bf8_t> &&
+                                  is_same_v<ComputeTypeB, f8_t>)
+                {
+                    add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances(
+                        op_ptrs);
+                }
+#endif
 #ifdef CK_ENABLE_FP32
                else if constexpr(is_same_v<InDataType, F32> && is_same_v<WeiDataType, F32> &&
                                  is_same_v<OutDataType, F32>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -216,6 +216,21 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
                                                           PassThrough,
                                                           PassThrough>>>& instances);
 #endif
+#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           BF8,
+                                                           F8>>>& instances);
+#endif

 #ifdef DL_KERNELS
 // dl
@@ -464,7 +479,9 @@ template <ck::index_t NumDimSpatial,
          typename OutLayout,
          typename InDataType,
          typename WeiDataType,
-          typename OutDataType>
+          typename OutDataType,
+          typename ComputeTypeA,
+          typename ComputeTypeB>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvBwdWeight<
    NumDimSpatial,
    InLayout,
@@ -475,7 +492,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    OutDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::PassThrough>>
+    ck::tensor_operation::element_wise::PassThrough,
+    ComputeTypeA,
+    ComputeTypeB>>
 {
    using DeviceOp = DeviceGroupedConvBwdWeight<NumDimSpatial,
                                                InLayout,
@@ -486,7 +505,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                                                OutDataType,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::PassThrough>;
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                ComputeTypeA,
+                                                ComputeTypeB>;

    static auto GetInstances()
    {
@@ -706,7 +727,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 #endif
 #ifdef CK_ENABLE_FP16
                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                                  is_same_v<OutDataType, half_t>)
+                                  is_same_v<OutDataType, half_t> &&
+                                  is_same_v<ComputeTypeA, half_t> &&
+                                  is_same_v<ComputeTypeB, half_t>)
                {
 #ifdef DL_KERNELS
                    add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
@@ -728,6 +751,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
                        op_ptrs);
                }
+#endif
+#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                                  is_same_v<OutDataType, half_t> &&
+                                  is_same_v<ComputeTypeA, bf8_t> && is_same_v<ComputeTypeB, f8_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+                        op_ptrs);
+                }
 #endif
            }
        }

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -16,6 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+
 #ifdef CK_ENABLE_BF16
 // grouped conv1d forward, GNWC/GKXC/GNWK
 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances(
@@ -32,6 +33,7 @@ void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<1,
@@ -47,6 +49,7 @@ void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<1,
@@ -62,6 +65,7 @@ void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<1,
@@ -77,100 +81,90 @@ void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
-#ifdef CK_ENABLE_BF16
-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
-void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              BF16,
-                                                              BF16,
+                                                              NHWGK,
+                                                              int8_t,
+                                                              int8_t,
                                                              Empty_Tuple,
-                                                              BF16,
+                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
-#ifdef CK_ENABLE_FP16
-void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
+                                                              NHWGK,
+                                                              int8_t,
+                                                              int8_t,
                                                              Empty_Tuple,
-                                                              F16,
+                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
-#ifdef CK_ENABLE_FP32
-void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F32,
-                                                              F32,
+                                                              NHWGK,
+                                                              int8_t,
+                                                              int8_t,
                                                              Empty_Tuple,
-                                                              F32,
+                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
-#ifdef DL_KERNELS
-#ifdef CK_ENABLE_FP16
-void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
+                                                              NHWGK,
+                                                              int8_t,
+                                                              int8_t,
                                                              Empty_Tuple,
-                                                              F16,
+                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
-#ifdef CK_ENABLE_FP32
-void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(
+
+#ifdef CK_ENABLE_BF16
+// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              GNHWC,
                                                              GKYXC,
                                                              Empty_Tuple,
                                                              GNHWK,
-                                                              F32,
-                                                              F32,
+                                                              BF16,
+                                                              BF16,
                                                              Empty_Tuple,
-                                                              F32,
+                                                              BF16,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
-#endif
-#ifdef CK_ENABLE_FP16
-void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);

-void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instances(
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              GNHWC,
                                                              GKYXC,
@@ -183,22 +177,26 @@ void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instances(
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
+#endif

-void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instances(
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              GNHWC,
                                                              GKYXC,
                                                              Empty_Tuple,
                                                              GNHWK,
-                                                              F16,
-                                                              F16,
+                                                              F32,
+                                                              F32,
                                                              Empty_Tuple,
-                                                              F16,
+                                                              F32,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
+#endif

-void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances(
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              GNHWC,
                                                              GKYXC,
@@ -211,23 +209,8 @@ void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances(
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
-
-#ifdef DL_KERNELS
-void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F16,
-                                                              F16,
-                                                              Empty_Tuple,
-                                                              F16,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
-#endif
 #endif
+
 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
@@ -285,22 +268,7 @@ void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
-#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
-void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              F32,
-                                                              F32,
-                                                              Empty_Tuple,
-                                                              F32,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);

-#endif
 // grouped conv2d forward, NHWGC/GKYXC/NHWGK
 #ifdef CK_ENABLE_BF16
 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
@@ -317,6 +285,7 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
@@ -388,63 +357,7 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
-#ifdef CK_ENABLE_INT8
-void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
-
-void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);

-void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
-
-void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              NHWGC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              NHWGK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
-#endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
@@ -460,6 +373,7 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(
@@ -476,6 +390,7 @@ void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
@@ -547,6 +462,7 @@ void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
@@ -562,6 +478,7 @@ void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
@@ -633,6 +550,7 @@ void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_BF16
 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
@@ -649,6 +567,7 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
@@ -663,7 +582,9 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
+#endif

+#ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
@@ -677,7 +598,9 @@ void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances(
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
+#endif

+#ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
@@ -691,7 +614,9 @@ void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instances
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
+#endif

+#ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
@@ -705,7 +630,9 @@ void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instanc
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
+#endif

+#ifdef CK_ENABLE_FP16
 void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
@@ -720,6 +647,88 @@ void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP8
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_f8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              F8>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
@@ -735,6 +744,7 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);
 #endif
+
 #ifdef CK_ENABLE_INT8
 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
@@ -807,13 +817,79 @@ void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances(
                                                              PassThrough>>>& instances);
 #endif

+#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
+void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              F32,
+                                                              F32,
+                                                              Empty_Tuple,
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+
+#endif
+
+#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
+void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+#endif
+
+#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
+void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWK,
+                                                              F16,
+                                                              F16,
+                                                              Empty_Tuple,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+#endif
+
+#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
+void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              GNHWC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              GNHWK,
+                                                              F32,
+                                                              F32,
+                                                              Empty_Tuple,
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+#endif
+
 template <ck::index_t NumDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout,
          typename InDataType,
          typename WeiDataType,
-          typename OutDataType>
+          typename OutDataType,
+          typename ComputeType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
    NumDimSpatial,
    InLayout,
@@ -826,7 +902,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    OutDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::PassThrough>>
+    ck::tensor_operation::element_wise::PassThrough,
+    ComputeType>>
 {
    using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                   InLayout,
@@ -839,7 +916,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                                                   OutDataType,
                                                   ck::tensor_operation::element_wise::PassThrough,
                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   ck::tensor_operation::element_wise::PassThrough>;
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   ComputeType>;

    static auto GetInstances()
    {
@@ -877,7 +955,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            }
 #endif
        }
-        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, GNHWK>)
        {
 #ifdef CK_ENABLE_FP32
@@ -885,25 +964,37 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                         is_same_v<OutDataType, float>)
            {
                add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs);
-#ifdef DL_KERNELS
-                add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs);
+            }
 #endif
+
+#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float>)
+            {
+                add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs);
            }
 #endif
+
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                         is_same_v<OutDataType, half_t>)
            {
                add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
-#ifdef DL_KERNELS
-                add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
-#endif
                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instances(op_ptrs);
                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instances(op_ptrs);
                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances(op_ptrs);
            }
 #endif
+
+#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t>)
+            {
+                add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
+            }
+#endif
+
 #ifdef CK_ENABLE_BF16
            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
@@ -911,8 +1002,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(op_ptrs);
            }
 #endif
+
 #ifdef CK_ENABLE_INT8
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<OutDataType, int8_t>)
            {
                add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances(op_ptrs);
@@ -922,33 +1014,43 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            }
 #endif
        }
-        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
+
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK>)
        {
+
 #ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
-#ifdef DL_KERNELS
-                add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
+            }
 #endif
+
+#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float>)
+            {
+                add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
            }
 #endif
+
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                         is_same_v<OutDataType, half_t>)
            {
                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
-#ifdef DL_KERNELS
-                add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
+            }
 #endif
-                add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances(op_ptrs);
+
+#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t>)
+            {
+                add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
            }
 #endif
+
 #ifdef CK_ENABLE_BF16
            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
@@ -967,7 +1069,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            }
 #endif
        }
-        else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, GNDHWC> &&
+
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, GNDHWC> &&
                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, GNDHWK>)
        {
 #ifdef CK_ENABLE_FP32
@@ -1010,7 +1113,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            }
 #endif
        }
-        else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
+
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
        {
 #ifdef CK_ENABLE_FP32
@@ -1020,9 +1124,18 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs);
            }
 #endif
+
+#ifdef CK_ENABLE_FP8
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, ck::f8_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_f8_instances(
+                    op_ptrs);
+            }
+#endif
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                         is_same_v<OutDataType, half_t>)
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
            {
                add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(op_ptrs);
                add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances(op_ptrs);

--- a/library/include/ck/library/utility/host_tensor_generator.hpp
+++ b/library/include/ck/library/utility/host_tensor_generator.hpp
@@ -111,6 +111,22 @@ struct GeneratorTensor_2<ck::f8_t>
 };
 #endif

+#if defined CK_ENABLE_BF8
+template <>
+struct GeneratorTensor_2<ck::bf8_t>
+{
+    int min_value = 0;
+    int max_value = 1;
+
+    template <typename... Is>
+    ck::bf8_t operator()(Is...)
+    {
+        float tmp = (std::rand() % (max_value - min_value)) + min_value;
+        return ck::type_convert<ck::bf8_t>(tmp);
+    }
+};
+#endif
+
 template <typename T>
 struct GeneratorTensor_3
 {
@@ -162,6 +178,25 @@ struct GeneratorTensor_3<ck::f8_t>
 };
 #endif

+#if defined CK_ENABLE_BF8
+template <>
+struct GeneratorTensor_3<ck::bf8_t>
+{
+    float min_value = 0;
+    float max_value = 1;
+
+    template <typename... Is>
+    ck::bf8_t operator()(Is...)
+    {
+        float tmp = float(std::rand()) / float(RAND_MAX);
+
+        float fp32_tmp = min_value + tmp * (max_value - min_value);
+
+        return ck::type_convert<ck::bf8_t>(fp32_tmp);
+    }
+};
+#endif
+
 template <typename T>
 struct GeneratorTensor_4
 {

--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -51,7 +51,7 @@ function(add_instance_library INSTANCE_NAME)
        set(result 0)
    endif()
    #message("add_instance_library returns ${result}")
-    return(PROPAGATE result)
+    set(result ${result} PARENT_SCOPE)
 endfunction(add_instance_library INSTANCE_NAME)



--- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt
 set(DEVICE_CONTRACTION_BILINEAR_INSTANCES)
-
-# FP32
+#float
 list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp)

-list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp)
-
-list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp)
-
-# FP64
+#double
 list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
                                                  device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp)

-list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp)
-
-# FP16
-list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp)
-
-# BF16
-list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
-                                                    device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp)
-
 add_instance_library(device_contraction_bilinear_instance ${DEVICE_CONTRACTION_BILINEAR_INSTANCES})