Merge branch 'develop' into navi3x_add_vectorload_check

d25fcb3d · zjing14 · GitHub · 270dc0a3 · 7613c1d9 · d25fcb3d
Unverified commit d25fcb3d, authored Apr 26, 2023 by zjing14; committed by GitHub on Apr 26, 2023
20 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -73,157 +73,18 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};

-    static constexpr auto K1Number = Number<K1>{};
-
-    static auto
-    MakeAGridDescriptor_KBatch_K0_M_K1(index_t M, index_t K, index_t StrideA, int KBatch, int KPad)
-    {
-        assert(KPad % (K1 * KBatch) == 0);
-
-        const index_t K0 = KPad / (K1 * KBatch);
-
-        const auto a_grid_desc_m_k = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
-            }
-        }();
-
-        const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
-            a_grid_desc_m_k,
-            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_right_pad_transform(M, PadM)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                a_grid_desc_m_kpad,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_pass_through_transform(M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    static auto
-    MakeBGridDescriptor_KBatch_K0_N_K1(index_t K, index_t N, index_t StrideB, int KBatch, int KPad)
-    {
-        assert(KPad % (K1 * KBatch) == 0);
-
-        const index_t K0 = KPad / (K1 * KBatch);
-
-        const auto b_grid_desc_k_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
-            }
-        }();
-
-        const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
-            b_grid_desc_k_n,
-            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-        else
-        {
-            return transform_tensor_descriptor(
-                b_grid_desc_kpad_n,
-                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
-        }
-    }
-
-    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
-    {
-        const auto c_grid_desc_m_n = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
-            }
-        }();
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
-        {
-            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
-            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-
-            return transform_tensor_descriptor(
-                c_grid_desc_m_n,
-                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-    }
-
-    static auto GetKPad(index_t K, index_t KBatch)
-    {
-        const index_t K0   = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock;
-        const index_t KPad = KBatch * K0 * K1;
-        return KPad;
-    }
-
-    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_KBatch_K0_M_K1(1, 1, 1, 1, 1));
-    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_KBatch_K0_N_K1(1, 1, 1, 1, 1));
-    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
-
-    // GridwiseGemm
    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
        BlockSize,
        ADataType, // TODO: distinguish A/B datatype
        AccDataType,
        CDataType,
-        InMemoryDataOperationEnum::Set,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
+        ALayout,
+        BLayout,
+        CLayout,
        AElementwiseOperation,
        BElementwiseOperation,
        CElementwiseOperation,
+        GemmSpec,
        MPerBlock,
        NPerBlock,
        K0PerBlock,
@@ -253,236 +114,68 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
        CBlockTransferScalarPerVector_NWaveNPerXDL,
        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;

-    // GridwiseGemm
-    using GridwiseGemmAtomicAdd = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
-        BlockSize,
-        ADataType, // TODO: distinguish A/B datatype
-        AccDataType,
-        CDataType,
-        InMemoryDataOperationEnum::AtomicAdd,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
-        AElementwiseOperation,
-        BElementwiseOperation,
-        CElementwiseOperation,
-        MPerBlock,
-        NPerBlock,
-        K0PerBlock,
-        MPerXDL,
-        NPerXDL,
-        K1,
-        MXdlPerWave,
-        NXdlPerWave,
-        ABlockTransferThreadClusterLengths_K0_M_K1,
-        ABlockTransferThreadClusterArrangeOrder,
-        ABlockTransferSrcAccessOrder,
-        ABlockTransferSrcVectorDim,
-        ABlockTransferSrcScalarPerVector,
-        ABlockTransferDstScalarPerVector_K1,
-        false, // AThreadTransferSrcResetCoordinateAfterRun,
-        ABlockLdsAddExtraM,
-        BBlockTransferThreadClusterLengths_K0_N_K1,
-        BBlockTransferThreadClusterArrangeOrder,
-        BBlockTransferSrcAccessOrder,
-        BBlockTransferSrcVectorDim,
-        BBlockTransferSrcScalarPerVector,
-        BBlockTransferDstScalarPerVector_K1,
-        false, // BThreadTransferSrcResetCoordinateAfterRun,
-        BBlockLdsAddExtraN,
-        CShuffleMRepeatPerShuffle,
-        CShuffleNRepeatPerShuffle,
-        CBlockTransferScalarPerVector_NWaveNPerXDL,
-        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
-
-    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
-
-    using Block2CTileMap = typename GridwiseGemm::CBlockClusterAdaptor;
-
-    // Argument
-    struct Argument : public BaseArgument
-    {
-        Argument(const ADataType* p_a_grid,
-                 const BDataType* p_b_grid,
-                 CDataType* p_c_grid,
-                 index_t M,
-                 index_t N,
-                 index_t K,
-                 index_t StrideA,
-                 index_t StrideB,
-                 index_t StrideC,
-                 index_t M01,
-                 index_t N01,
-                 AElementwiseOperation a_element_op,
-                 BElementwiseOperation b_element_op,
-                 CElementwiseOperation c_element_op,
-                 index_t k_batch)
-            : p_a_grid_{p_a_grid},
-              p_b_grid_{p_b_grid},
-              p_c_grid_{p_c_grid},
-              a_grid_desc_kbatch_k0_m_k1_{},
-              b_grid_desc_kbatch_k0_n_k1_{},
-              c_grid_desc_m_n_{},
-              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
-              a_element_op_{a_element_op},
-              b_element_op_{b_element_op},
-              c_element_op_{c_element_op},
-              k_batch_{k_batch}
-        {
-            int KPad = DeviceGemmXdlSplitKCShuffle::GetKPad(K, k_batch_);
-
-            a_grid_desc_kbatch_k0_m_k1_ =
-                DeviceGemmXdlSplitKCShuffle::MakeAGridDescriptor_KBatch_K0_M_K1(
-                    M, K, StrideA, k_batch_, KPad);
-            b_grid_desc_kbatch_k0_n_k1_ =
-                DeviceGemmXdlSplitKCShuffle::MakeBGridDescriptor_KBatch_K0_N_K1(
-                    K, N, StrideB, k_batch_, KPad);
-            c_grid_desc_m_n_ = DeviceGemmXdlSplitKCShuffle::MakeCGridDescriptor_M_N(M, N, StrideC);
-
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
-
-            if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
-                                           b_grid_desc_kbatch_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
-                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_);
-            }
-        }
-
-        //  private:
-        const ADataType* p_a_grid_;
-        const BDataType* p_b_grid_;
-        CDataType* p_c_grid_;
-        AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_;
-        BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_;
-        CGridDesc_M_N c_grid_desc_m_n_;
-        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        Block2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
-        AElementwiseOperation a_element_op_;
-        BElementwiseOperation b_element_op_;
-        CElementwiseOperation c_element_op_;
-        index_t k_batch_;
-    };
+    using Argument              = typename GridwiseGemm::Argument;
+    using DefaultBlock2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;

    // Invoker
    struct Invoker : public BaseInvoker
    {
-        using Argument = DeviceGemmXdlSplitKCShuffle::Argument;

-        void Print(const Argument& arg)
-        {
-            std::cout << "arg.a_grid_desc_kbatch_k0_m_k1_{"
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2) << ", "
-                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I3) << "}" << std::endl;
-
-            std::cout << "arg.b_grid_desc_kbatch_k0_n_k1_{"
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I0) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I2) << ", "
-                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I3) << "}" << std::endl;
-
-            std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
-                      << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
-        }
+        void Print(const Argument& karg) { karg.Print(); } // logging hook called by Run(); output is produced by Argument::Print()

-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        float Run(const Argument& karg, const StreamConfig& stream_config = StreamConfig{})
        {
            if(stream_config.log_level_ > 0)
            {
-                Print(arg);
+                Print(karg);
            }

-            const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
+            const auto kbatch = karg.k_batch;

-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                            arg.b_grid_desc_kbatch_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!GridwiseGemm::CheckValidity(karg))
            {
                throw std::runtime_error(
-                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid setting");
+                    "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid "
+                    "setting");
            }

-            const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
-
-            const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+            const auto b2c_map = DefaultBlock2CTileMap{};
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
+            const auto K0           = karg.K0;

            const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);

            float ave_time = 0;

            const auto Run = [&](const auto& kernel) {
-                hipGetErrorString(hipMemset(
-                    arg.p_c_grid_,
-                    0,
-                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
-                        sizeof(CDataType)));
-
-                ave_time =
-                    launch_and_time_kernel(stream_config,
-                                           kernel,
-                                           dim3(grid_size),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.p_a_grid_,
-                                           arg.p_b_grid_,
-                                           arg.p_c_grid_,
-                                           arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.block_2_ctile_map_);
+                if(kbatch > 1)
+                    hipGetErrorString(
+                        hipMemset(karg.p_c_grid, 0, karg.M * karg.N * sizeof(CDataType)));
+
+                ave_time = launch_and_time_kernel(
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg, b2c_map);
            };

            if(has_main_k0_block_loop)
            {
                if(kbatch == 1)
                {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        true>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::Set,
+                                                             DefaultBlock2CTileMap>;

                    Run(kernel);
                }
                else
                {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemmAtomicAdd,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        true>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             true,
+                                                             InMemoryDataOperationEnum::AtomicAdd,
+                                                             DefaultBlock2CTileMap>;

                    Run(kernel);
                }
@@ -491,37 +184,21 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
            {
                if(kbatch == 1)
                {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        false>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             false,
+                                                             InMemoryDataOperationEnum::Set,
+                                                             DefaultBlock2CTileMap>;

                    Run(kernel);
                }
                else
                {
-                    const auto kernel = kernel_gemm_xdlops_v2r4r2<
-                        GridwiseGemmAtomicAdd,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::
-                                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmXdlSplitKCShuffle::Block2CTileMap>,
-                        false>;
+                    const auto kernel =
+                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
+                                                             false,
+                                                             InMemoryDataOperationEnum::AtomicAdd,
+                                                             DefaultBlock2CTileMap>;

                    Run(kernel);
                }
@@ -544,12 +221,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
        return true;
    }

-    static bool IsSupportedArgument(const Argument& arg)
+    static bool IsSupportedArgument(const Argument& karg)
    {
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
+        return GridwiseGemm::CheckValidity(karg); // support is decided solely by GridwiseGemm::CheckValidity on the whole argument
    }

    // polymorphic
@@ -567,9 +241,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                             index_t StrideA,
                             index_t StrideB,
                             index_t StrideC,
-                             AElementwiseOperation a_element_op,
-                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op,
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation,
                             index_t KBatch)
    {
        return Argument{p_a,
@@ -581,11 +255,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                        StrideA,
                        StrideB,
                        StrideC,
-                        1,
-                        1,
-                        a_element_op,
-                        b_element_op,
-                        c_element_op,
+                        GridwiseGemm::CalculateMPadded(M),
+                        GridwiseGemm::CalculateNPadded(N),
+                        GridwiseGemm::CalculateKPadded(K),
+                        GridwiseGemm::CalculateK0(K, KBatch),
                        KBatch};
    }

@@ -601,9 +274,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                                      index_t StrideA,
                                                      index_t StrideB,
                                                      index_t StrideC,
-                                                      AElementwiseOperation a_element_op,
-                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation,
                                                      ck::index_t KBatch = 1) override
    {
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
@@ -615,11 +288,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                          StrideA,
                                          StrideB,
                                          StrideC,
-                                          1,
-                                          1,
-                                          a_element_op,
-                                          b_element_op,
-                                          c_element_op,
+                                          GridwiseGemm::CalculateMPadded(M),
+                                          GridwiseGemm::CalculateNPadded(N),
+                                          GridwiseGemm::CalculateKPadded(K),
+                                          GridwiseGemm::CalculateK0(K, KBatch),
                                          KBatch);
    }

@@ -630,31 +302,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
    }

    // polymorphic
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "DeviceGemmXdlSplitKCShuffle"
-            << "<"
-            << BlockSize << ", "
-            << MPerBlock << ", "
-            << NPerBlock << ", "
-            << K0PerBlock << ", "
-            << K1 << ", "
-            << MPerXDL << ", "
-            << NPerXDL << ", "
-            << MXdlPerWave << ", "
-            << NXdlPerWave << ", "
-            << ABlockTransferSrcScalarPerVector << ", "
-            << ABlockTransferDstScalarPerVector_K1 << ", "
-            << BBlockTransferSrcScalarPerVector << ", "
-            << BBlockTransferDstScalarPerVector_K1
-            << ">";
-        // clang-format on
-
-        return str.str();
-    }
+    std::string GetTypeString() const override { return GridwiseGemm::GetTypeString(); } // returns the string reported by GridwiseGemm::GetTypeString()
 };

 } // namespace device

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/ck.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename GridwiseGemm, // provides GetSharedMemoryNumberOfByte() and the templated static Run() used below
+          typename GemmDesc, // element type of the descriptor array; must expose block_start_, block_end_, karg_ and block_2_ctile_map_
+          bool HasMainKBlockLoop, // forwarded as the first template argument of GridwiseGemm::Run
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation> // forwarded as the second template argument of GridwiseGemm::Run
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_grouped_gemm_xdl_splitk(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const, // Kernel entry: each block finds the descriptor whose [block_start_, block_end_) contains its block id and runs that descriptor's GEMM. gemm_descs_const is reinterpreted below as an array of GemmDesc.
+                                       const index_t group_count) // initial upper bound of the group search; presumably the number of descriptors - TODO confirm against the launch site
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) // real body only for non-device passes and gfx908/gfx90a device passes
+    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte(); // compile-time byte count requested by GridwiseGemm
+    __shared__ uint8_t p_shared[shared_size]; // shared-memory byte buffer handed to GridwiseGemm::Run
+
+    const index_t block_id = get_block_1d_id(); // id of this block as reported by get_block_1d_id()
+    const auto gemm_desc_ptr =
+        reinterpret_cast<const GemmDesc*>(cast_pointer_to_generic_address_space(gemm_descs_const)); // typed view of the descriptor array
+
+    index_t left     = 0; // lower search bound (group index)
+    index_t right    = group_count; // upper search bound (group index)
+    index_t group_id = index_t((left + right) / 2); // current probe: midpoint of the bounds
+    while((!(block_id >= gemm_desc_ptr[group_id].block_start_ &&
+             block_id < gemm_desc_ptr[group_id].block_end_)) && // keep probing until block_id lies in the probed group's half-open block range
+          left <= right) // NOTE(review): left/right are only ever set to the midpoint, so this guard looks always true for group_count >= 0; a block_id outside every range would then never exit - confirm the grid size always matches the last block_end_
+    {
+        if(block_id < gemm_desc_ptr[group_id].block_start_)
+        {
+            right = group_id; // block belongs to an earlier group; assumes descriptors are ordered by ascending block range - TODO confirm
+        }
+        else
+        {
+            left = group_id; // block is at or past this group's block_end_, so look in later groups
+        }
+        group_id = index_t((left + right) / 2); // re-probe at the new midpoint
+    }
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>( // run the selected group's GEMM with its own argument and tile map
+        gemm_desc_ptr[group_id].karg_,
+        static_cast<void*>(p_shared),
+        gemm_desc_ptr[group_id].block_2_ctile_map_);
+#else // other device targets: the kernel body is empty
+    ignore = gemm_descs_const; // presumably silences unused-parameter warnings
+    ignore = group_count;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+}
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          ck::index_t NumPrefetch,
+          ck::index_t BlockSize,
+          ck::index_t MPerBlock,
+          ck::index_t NPerBlock,
+          ck::index_t KPerBlock,
+          ck::index_t AK1,
+          ck::index_t BK1,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
+          ck::index_t MXdlPerWave,
+          ck::index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          ck::index_t ABlockTransferSrcVectorDim,
+          ck::index_t ABlockTransferSrcScalarPerVector,
+          ck::index_t ABlockTransferDstScalarPerVector_K1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          ck::index_t BBlockTransferSrcVectorDim,
+          ck::index_t BBlockTransferSrcScalarPerVector,
+          ck::index_t BBlockTransferDstScalarPerVector_K1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CDEBlockTransferScalarPerVector_NPerBlock,
+          LoopScheduler LoopSched = make_default_loop_scheduler(),
+          // Current implementation does not support multiple D fusions.
+          enable_if_t<AK1 == BK1 && is_same_v<DsLayout, ck::Tuple<>> &&
+                          is_same_v<DsDataType, ck::Tuple<>>,
+                      bool> = false>
+struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayout,
+                                                                           BLayout,
+                                                                           DsLayout,
+                                                                           ELayout,
+                                                                           ADataType,
+                                                                           BDataType,
+                                                                           DsDataType,
+                                                                           EDataType,
+                                                                           AElementwiseOperation,
+                                                                           BElementwiseOperation,
+                                                                           CDEElementwiseOperation>
+{
+    // Number of D tensors as reported by DsDataType::Size(). The enable_if on this template only
+    // admits DsDataType == ck::Tuple<>, so presumably this is always 0 - TODO confirm.
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    // Compile-time index constants.
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    // KPerBlock must be a whole multiple of AK1; K0PerBlock is the resulting quotient.
+    static_assert(KPerBlock % AK1 == 0);
+    static constexpr index_t K0PerBlock = KPerBlock / AK1;
+
+    // Gridwise GEMM used for every group. Only ADataType and AK1 are forwarded: BDataType and BK1
+    // do not appear in this argument list (the template constraint requires AK1 == BK1).
+    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2<
+        BlockSize,
+        ADataType, // TODO: distinguish A/B datatype
+        AccDataType,
+        EDataType,
+        ALayout,
+        BLayout,
+        ELayout,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CDEElementwiseOperation,
+        GemmSpec,
+        MPerBlock,
+        NPerBlock,
+        K0PerBlock,
+        MPerXDL,
+        NPerXDL,
+        AK1,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_K0_M_K1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_K1,
+        false, // AThreadTransferSrcResetCoordinateAfterRun,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_K0_N_K1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_K1,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CDEBlockTransferScalarPerVector_NPerBlock,
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>;
+
+    // Output descriptor type and the per-group split-K tile map built from it.
+    using CGridDesc_M_N = typename GridwiseGemm::CGridDesc_M_N;
+    using Block2ETileMapKSplit =
+        BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>;
+    // Block2CTileMap configuration parameter; passed as the second constructor argument of
+    // Block2ETileMapKSplit wherever a tile map is built in this struct.
+    static constexpr index_t B2E_M01 = 8;
+    // Per-group tile map wrapped together with the group's starting block id.
+    using GroupedGemmBlock2ETileMap  = OffsettedBlockToCTileMap<Block2ETileMapKSplit>;
+    // Argument type consumed by the gridwise GEMM.
+    using KernelArgument             = typename GridwiseGemm::Argument;
+
+    /**
+     * @brief      Per-group record: gridwise kernel argument, offsetted tile map and the
+     *             [block_start_, block_end_) range of block ids assigned to the group.
+     *
+     * @note       Member order is left untouched on purpose: arrays of this struct are copied
+     *             byte-wise with hipMemcpy by the invoker.
+     */
+    struct GemmTransKernelArg
+    {
+        KernelArgument karg_;
+        GroupedGemmBlock2ETileMap block_2_ctile_map_;
+        index_t block_start_, block_end_;
+
+        GemmTransKernelArg() = default;
+
+        /**
+         * @param      karg         Gridwise kernel argument of the group (moved from).
+         * @param      b2c_map      Offsetted block-to-tile map of the group (moved from).
+         * @param[in]  block_start  First block id of the group.
+         * @param[in]  block_end    One past the last block id of the group.
+         */
+        GemmTransKernelArg(KernelArgument&& karg,
+                           GroupedGemmBlock2ETileMap&& b2c_map,
+                           index_t block_start,
+                           index_t block_end)
+            // A named rvalue-reference parameter is an lvalue; without std::move the members
+            // would be copy-constructed despite the && signature.
+            : karg_{std::move(karg)},
+              block_2_ctile_map_{std::move(b2c_map)},
+              block_start_{block_start},
+              block_end_{block_end}
+        {
+        }
+    };
+
+    // kbatch used by the Argument constructor overload that does not take an explicit kbatch.
+    static constexpr index_t DefaultKBatch = 1;
+
+    // Argument
+    /**
+     * @brief      Host-side argument: one GemmTransKernelArg per group with M != 0, plus the total
+     *             number of blocks needed to cover all of them.
+     */
+    struct Argument : public BaseArgument
+    {
+        /**
+         * @brief      Build the argument with DefaultKBatch as the split-K factor.
+         */
+        Argument(std::vector<const void*>& p_As,
+                 std::vector<const void*>& p_Bs,
+                 std::vector<void*>& p_Es,
+                 std::vector<GemmDesc>& gemm_descs)
+            : Argument(p_As, p_Bs, p_Es, gemm_descs, DefaultKBatch)
+        {
+            // TODO: use occupancy api to calculate appropriate batch size.
+        }
+
+        /**
+         * @brief      Build per-group kernel arguments and block ranges.
+         *
+         * @param[in]  p_As        A pointers, one per group.
+         * @param[in]  p_Bs        B pointers, one per group.
+         * @param[in]  p_Es        Output pointers, one per group.
+         * @param[in]  gemm_descs  Problem sizes and strides, one per group.
+         * @param[in]  kbatch      Split-K factor applied to every group; must be >= 1.
+         *
+         * @throws     std::runtime_error if kbatch < 1 or if the pointer vectors do not have
+         *             exactly one entry per descriptor.
+         */
+        Argument(std::vector<const void*>& p_As,
+                 std::vector<const void*>& p_Bs,
+                 std::vector<void*>& p_Es,
+                 std::vector<GemmDesc>& gemm_descs,
+                 index_t kbatch)
+            : K_BATCH{kbatch}
+        {
+            // GridwiseGemm::CalculateK0 divides by a multiple of kbatch, so 0 would be a
+            // division by zero and negative values are meaningless.
+            if(kbatch < 1)
+            {
+                throw std::runtime_error("wrong! kbatch must be >= 1");
+            }
+
+            grid_size_   = 0;
+            group_count_ = ck::type_convert<ck::index_t>(gemm_descs.size());
+
+            if(!(group_count_ == ck::type_convert<ck::index_t>(p_As.size()) &&
+                 group_count_ == ck::type_convert<ck::index_t>(p_Bs.size()) &&
+                 group_count_ == ck::type_convert<ck::index_t>(p_Es.size())))
+            {
+                throw std::runtime_error("wrong! group_count_ != p_As/b/c.size");
+            }
+
+            gemm_kernel_args_.reserve(group_count_);
+
+            skipped_group_count_ = 0;
+
+            for(std::size_t i = 0; i < gemm_descs.size(); ++i)
+            {
+                const index_t M = gemm_descs[i].M_;
+                const index_t N = gemm_descs[i].N_;
+                const index_t K = gemm_descs[i].K_;
+
+                // Groups with M == 0 get no kernel argument; they are only counted.
+                if(M == 0)
+                {
+                    skipped_group_count_++;
+                    continue;
+                }
+
+                const index_t stride_a = gemm_descs[i].stride_A_;
+                const index_t stride_b = gemm_descs[i].stride_B_;
+                const index_t stride_c = gemm_descs[i].stride_C_;
+
+                const index_t m_padded = GridwiseGemm::CalculateMPadded(M);
+                const index_t n_padded = GridwiseGemm::CalculateNPadded(N);
+                const index_t k_padded = GridwiseGemm::CalculateKPadded(K, K_BATCH);
+                const index_t k0       = GridwiseGemm::CalculateK0(K, K_BATCH);
+
+                const auto c_grid_desc_m_n =
+                    GridwiseGemm::MakeCGridDescriptor_M_N(M, N, m_padded, n_padded, stride_c);
+
+                const auto local_b2c_tile_map =
+                    Block2ETileMapKSplit{c_grid_desc_m_n, B2E_M01, K_BATCH};
+                const index_t grid_size_grp = local_b2c_tile_map.CalculateGridSize(c_grid_desc_m_n);
+
+                // The group owns block ids [block_start, block_end) of the combined grid.
+                const index_t block_start = grid_size_;
+                const index_t block_end   = grid_size_ + grid_size_grp;
+
+                grid_size_ += grid_size_grp;
+
+                // block-to-e-tile map
+                auto grouped_block_2_ctile_map =
+                    GroupedGemmBlock2ETileMap(local_b2c_tile_map, block_start);
+
+                auto karg = KernelArgument{type_convert<const ADataType*>(p_As[i]),
+                                           type_convert<const BDataType*>(p_Bs[i]),
+                                           type_convert<EDataType*>(p_Es[i]),
+                                           M,
+                                           N,
+                                           K,
+                                           stride_a,
+                                           stride_b,
+                                           stride_c,
+                                           m_padded,
+                                           n_padded,
+                                           k_padded,
+                                           k0,
+                                           K_BATCH};
+
+                gemm_kernel_args_.emplace_back(
+                    std::move(karg), std::move(grouped_block_2_ctile_map), block_start, block_end);
+            }
+        }
+
+        /**
+         * @brief      Recalculate group grid size for all gemms and update B2C maps.
+         *
+         * @param[in]  kbatch  The new splitK parameter value; must be >= 1.
+         *
+         * @throws     std::runtime_error if kbatch < 1. The argument is left unmodified then.
+         */
+        void UpdateKBatch(index_t kbatch)
+        {
+            // Checked before any state is touched; see the constructor for the reason.
+            if(kbatch < 1)
+            {
+                throw std::runtime_error("wrong! kbatch must be >= 1");
+            }
+
+            K_BATCH    = kbatch;
+            grid_size_ = 0;
+
+            for(std::size_t i = 0; i < gemm_kernel_args_.size(); ++i)
+            {
+
+                auto& karg = gemm_kernel_args_[i].karg_;
+
+                const index_t k_padded = GridwiseGemm::CalculateKPadded(karg.K, K_BATCH);
+                const index_t k0       = GridwiseGemm::CalculateK0(karg.K, K_BATCH);
+
+                const auto c_grid_desc_m_n = GridwiseGemm::MakeCGridDescriptor_M_N(
+                    karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+
+                const auto local_b2c_tile_map =
+                    Block2ETileMapKSplit{c_grid_desc_m_n, B2E_M01, K_BATCH};
+                const index_t grid_size_grp = local_b2c_tile_map.CalculateGridSize(c_grid_desc_m_n);
+
+                const index_t block_start = grid_size_;
+                const index_t block_end   = grid_size_ + grid_size_grp;
+
+                grid_size_ += grid_size_grp;
+
+                // block-to-e-tile map
+                auto grouped_block_2_ctile_map =
+                    GroupedGemmBlock2ETileMap(local_b2c_tile_map, block_start);
+
+                // Only the K-dependent fields and the block range change with kbatch.
+                karg.KPadded                            = k_padded;
+                karg.K0                                 = k0;
+                karg.k_batch                            = K_BATCH;
+                gemm_kernel_args_[i].block_2_ctile_map_ = grouped_block_2_ctile_map;
+                gemm_kernel_args_[i].block_start_       = block_start;
+                gemm_kernel_args_[i].block_end_         = block_end;
+            }
+        }
+
+        //  private:
+        index_t K_BATCH;              // split-K factor shared by all groups
+        index_t group_count_;         // number of descriptors passed in
+        index_t skipped_group_count_; // number of descriptors with M == 0
+
+        std::vector<GemmTransKernelArg> gemm_kernel_args_; // one entry per non-skipped group
+        index_t grid_size_;                                // sum of all per-group grid sizes
+    };
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        /**
+         * @brief      Validate every group, upload the per-group arguments and launch one kernel.
+         *
+         * @param[in]  arg            Host argument holding the per-group kernel arguments.
+         * @param[in]  stream_config  Launch configuration; a log_level_ above 0 prints each group.
+         *
+         * @return     The value reported by launch_and_time_kernel, or 0 when there is no group
+         *             to run.
+         *
+         * @throws     std::runtime_error if a group fails GridwiseGemm::CheckValidity, or if the
+         *             groups disagree on having a main K0 block loop or on kbatch being > 1.
+         */
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            // No group to run (e.g. every descriptor had M == 0). Indexing group 0 below would
+            // read from an empty vector, so return before touching it.
+            if(arg.gemm_kernel_args_.empty())
+            {
+                return 0;
+            }
+
+            // Group 0 is the reference every other group is compared against.
+            index_t K0                       = arg.gemm_kernel_args_[0].karg_.K0;
+            bool all_have_kbatch_gt_one      = arg.gemm_kernel_args_[0].karg_.k_batch > 1;
+            bool all_have_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
+
+            for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
+            {
+                const auto& karg = arg.gemm_kernel_args_[i].karg_;
+                if(stream_config.log_level_ > 0)
+                {
+                    karg.Print();
+                }
+
+                auto kbatch = karg.k_batch;
+
+                if(!GridwiseGemm::CheckValidity(karg))
+                {
+                    std::ostringstream err;
+                    err << "Group id: " << i << " has invalid GridwiseGemm settings!" << __FILE__
+                        << ":" << __LINE__ << ", in function: " << __func__;
+                    throw std::runtime_error(err.str());
+                }
+
+                K0 = karg.K0;
+                bool not_all_have_main_k0_block_loop_same =
+                    all_have_main_k0_block_loop xor GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
+                bool not_all_have_kbatch_value_same = all_have_kbatch_gt_one xor (kbatch > 1);
+
+                if(not_all_have_main_k0_block_loop_same)
+                {
+                    std::ostringstream err;
+                    err << "Not all gemms have same value for main_k0_block_loop! in " << __FILE__
+                        << ":" << __LINE__ << ", in function: " << __func__;
+                    throw std::runtime_error(err.str());
+                }
+
+                if(not_all_have_kbatch_value_same)
+                {
+                    std::ostringstream err;
+                    err << "Not all gemms have same kbatch value (=1 or >1)! "
+                        << "group [" << i << "], kbatch: " << kbatch
+                        << ", group [0], kbatch: " << arg.gemm_kernel_args_[0].karg_.k_batch
+                        << " in " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
+                    throw std::runtime_error(err.str());
+                }
+            }
+
+            // Host-to-device copy of the whole per-group argument array into the workspace.
+            hip_check_error(hipMemcpy(arg.p_workspace_,
+                                      arg.gemm_kernel_args_.data(),
+                                      arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                                      hipMemcpyHostToDevice));
+
+            float ave_time = 0;
+
+            // Named differently from the member function Run to avoid shadowing it.
+            const auto launch_kernel = [&](const auto& kernel) {
+                if(all_have_kbatch_gt_one)
+                {
+                    // The kbatch > 1 kernels are instantiated with AtomicAdd, so every output is
+                    // zeroed first.
+                    // NOTE(review): this clears M * N contiguous elements and does not look at
+                    // StrideC - confirm outputs are always packed.
+                    for(const auto& trans_arg : arg.gemm_kernel_args_)
+                    {
+                        const auto& karg = trans_arg.karg_;
+                        // Widen before multiplying so M * N cannot overflow index_t.
+                        const std::size_t c_bytes =
+                            static_cast<std::size_t>(karg.M) * karg.N * sizeof(EDataType);
+                        hip_check_error(hipMemset(karg.p_c_grid, 0, c_bytes));
+                    }
+                }
+
+                ave_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(arg.grid_size_),
+                                           dim3(BlockSize),
+                                           0,
+                                           cast_pointer_to_constant_address_space(arg.p_workspace_),
+                                           arg.gemm_kernel_args_.size());
+            };
+
+            // Pick the kernel instantiation: <has main K0 loop, AtomicAdd for kbatch > 1 / Set>.
+            if(all_have_main_k0_block_loop)
+            {
+                if(all_have_kbatch_gt_one)
+                {
+                    const auto kernel =
+                        kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
+                                                       GemmTransKernelArg,
+                                                       true,
+                                                       InMemoryDataOperationEnum::AtomicAdd>;
+
+                    launch_kernel(kernel);
+                }
+                else
+                {
+                    const auto kernel =
+                        kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
+                                                       GemmTransKernelArg,
+                                                       true,
+                                                       InMemoryDataOperationEnum::Set>;
+
+                    launch_kernel(kernel);
+                }
+            }
+            else
+            {
+                if(all_have_kbatch_gt_one)
+                {
+                    const auto kernel =
+                        kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
+                                                       GemmTransKernelArg,
+                                                       false,
+                                                       InMemoryDataOperationEnum::AtomicAdd>;
+
+                    launch_kernel(kernel);
+                }
+                else
+                {
+                    const auto kernel =
+                        kernel_grouped_gemm_xdl_splitk<GridwiseGemm,
+                                                       GemmTransKernelArg,
+                                                       false,
+                                                       InMemoryDataOperationEnum::Set>;
+
+                    launch_kernel(kernel);
+                }
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic
+        // NOTE(review): p_arg is dereferenced without a null check; callers must pass an
+        // Argument of this device op.
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    // Placeholder compile-time parameter check: currently accepts every instantiation.
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    /**
+     * @brief      Check that the argument is internally consistent and that every stored group
+     *             passes GridwiseGemm::CheckValidity.
+     *
+     * @return     false if stored plus skipped groups do not add up to group_count_, or if any
+     *             group is rejected; true otherwise.
+     */
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        const auto stored_groups = ck::type_convert<ck::index_t>(arg.gemm_kernel_args_.size());
+        if(stored_groups + arg.skipped_group_count_ != arg.group_count_)
+        {
+            return false;
+        }
+
+        // Every group is checked (no early exit) so that DEBUG_LOG reports all offenders.
+        bool all_groups_valid = true;
+        for(std::size_t group_id = 0; group_id < arg.gemm_kernel_args_.size(); ++group_id)
+        {
+            const auto& karg       = arg.gemm_kernel_args_[group_id].karg_;
+            const bool group_valid = GridwiseGemm::CheckValidity(karg);
+#if DEBUG_LOG
+            if(!group_valid)
+            {
+                std::cout << "[" << __func__ << "] group id: " << group_id << " is not supported!\n";
+                karg.Print();
+            }
+#endif // DEBUG_LOG
+            all_groups_valid = all_groups_valid && group_valid;
+        }
+        return all_groups_valid;
+    }
+
+    // polymorphic
+    // Type-erased entry point: downcasts and forwards to the static overload.
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* typed_arg = dynamic_cast<const Argument*>(p_arg);
+        return IsSupportedArgument(*typed_arg);
+    }
+
+    // Builds an Argument by value with the default kbatch. The D-tensor pointers and the three
+    // element-wise operations are accepted but not used.
+    static auto MakeArgument(std::vector<const void*>& p_As,
+                             std::vector<const void*>& p_Bs,
+                             std::vector<std::array<const void*, NumDTensor>>&,
+                             std::vector<void*>& p_Es,
+                             std::vector<GemmDesc> gemm_descs,
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CDEElementwiseOperation)
+    {
+        return Argument(p_As, p_Bs, p_Es, gemm_descs);
+    }
+
+    // Returns a value-initialized Invoker.
+    static auto MakeInvoker()
+    {
+        return Invoker();
+    }
+
+    // polymorphic
+    // Heap-allocating counterpart of MakeArgument; the D-tensor pointers and the element-wise
+    // operations are accepted but not used.
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::vector<const void*>& p_As,
+                        std::vector<const void*>& p_Bs,
+                        std::vector<std::array<const void*, NumDTensor>>&,
+                        std::vector<void*>& p_Es,
+                        std::vector<GemmDesc>& gemm_descs,
+                        AElementwiseOperation,
+                        BElementwiseOperation,
+                        CDEElementwiseOperation) override
+    {
+        std::unique_ptr<BaseArgument> arg_ptr =
+            std::make_unique<Argument>(p_As, p_Bs, p_Es, gemm_descs);
+        return arg_ptr;
+    }
+
+    // polymorphic
+    // Heap-allocating counterpart of MakeInvoker.
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>();
+    }
+
+    // polymorphic
+    // Human-readable identifier of this instantiation: the first character of each layout name
+    // followed by the main tuning parameters and the GEMM specialization.
+    std::string GetTypeString() const override
+    {
+        std::stringstream type_str;
+
+        // clang-format off
+        type_str << "DeviceGroupedGemm_XdlSplitK" << "<";
+
+        // Layout initials are separated by a bare comma, the numeric parameters by ", ".
+        type_str << std::string(ALayout::name)[0] << ",";
+        type_str << std::string(BLayout::name)[0] << ",";
+        type_str << std::string(ELayout::name)[0] << ",";
+
+        type_str << BlockSize << ", ";
+        type_str << MPerBlock << ", ";
+        type_str << NPerBlock << ", ";
+        type_str << KPerBlock << ", ";
+        type_str << AK1 << ", ";
+        type_str << BK1 << ", ";
+        type_str << MPerXDL << ", ";
+        type_str << NPerXDL << ", ";
+        type_str << MXdlPerWave << ", ";
+        type_str << NXdlPerWave << ", ";
+        type_str << ABlockTransferSrcScalarPerVector << ", ";
+        type_str << BBlockTransferSrcScalarPerVector << ", ";
+        type_str << CShuffleMXdlPerWavePerShuffle << ", ";
+        type_str << CShuffleNXdlPerWavePerShuffle << ", ";
+
+        type_str << getGemmSpecializationString(GemmSpec) << ">";
+        // clang-format on
+
+        return type_str.str();
+    }
+
+    // Workspace bytes needed: one GemmTransKernelArg per stored group.
+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
+    {
+        const Argument* typed_arg = dynamic_cast<const Argument*>(p_arg);
+        const auto stored_groups  = typed_arg->gemm_kernel_args_.size();
+        return stored_groups * sizeof(GemmTransKernelArg);
+    }
+
+    // Applies a new split-K factor by delegating to Argument::UpdateKBatch.
+    static void SetKBatchSize(Argument& arg, index_t kbatch)
+    {
+        arg.UpdateKBatch(kbatch);
+    }
+
+    // polymorphic
+    // Type-erased entry point: downcasts and forwards to the static overload.
+    void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
+    {
+        Argument* typed_arg = dynamic_cast<Argument*>(p_arg);
+        SetKBatchSize(*typed_arg, kbatch);
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -587,4 +587,52 @@ struct OffsettedBlockToCTileMap
    index_t block_start_;
 };

+/**
+ * @brief      Simple tile mapping which creates 3D grid of block of threads.
+ *
+ * @paragraph  Description
+ *             This Block-to-C-tile-map creates a 3D grid (n_blocks, m_blocks, z_blocks) of thread
+ *             blocks. The first 2D are regular 2D tiles created by division of output GEMM
+ *             dimensions by corresponding tile size. The third dimension (Z) is a k-split dimension,
+ *             which denotes the number of blocks we use to divide work on GEMM K dimension onto.
+ *
+ * @tparam     MPerBlock  Output block tile size in M dimension.
+ * @tparam     NPerBlock  Output block tile size in N dimension.
+ */
+template <index_t MPerBlock, index_t NPerBlock>
+struct BlockToCTileMap_3DGrid_KSplit
+{
+    __host__ __device__ BlockToCTileMap_3DGrid_KSplit() = default;
+
+    // Grid extents as a std::tuple ordered (N tiles, M tiles, k_split); tile counts come from
+    // math::integer_divide_ceil with the respective per-block size.
+    __host__ __device__ constexpr auto
+    CalculateGridSize(index_t M, index_t N, index_t k_split) const
+    {
+        return std::make_tuple(math::integer_divide_ceil(N, NPerBlock),
+                               math::integer_divide_ceil(M, MPerBlock),
+                               k_split);
+    }
+
+    // Returns (blockIdx.z, blockIdx.y, blockIdx.x); the index argument is not used.
+    template <typename TopIdx>
+    __device__ constexpr auto CalculateBottomIndex(const TopIdx&) const
+    {
+        return make_tuple(blockIdx.z, blockIdx.y, blockIdx.x);
+    }
+
+    // Accepts every tile index; this relies on the launch grid coming from CalculateGridSize().
+    template <typename CTileIdx, typename CTileDim>
+    __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
+                                             const CTileDim& /* c_tile_dim */) const
+    {
+        return true;
+    }
+
+    // Accepts every output descriptor.
+    template <typename CGridDesc_M_N>
+    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
+    {
+        return true;
+    }
+};
+
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -15,63 +15,31 @@
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+
 namespace ck {

+/**
+ * @brief      Kernel entry point that forwards to GridwiseGemm::Run.
+ *
+ * Declares a __shared__ byte buffer of GridwiseGemm::GetSharedMemoryNumberOfByte() bytes and calls
+ * GridwiseGemm::Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(karg, p_shared, b2c_map).
+ * The body is only compiled for the host pass and for gfx908 / gfx90a; otherwise both parameters
+ * are ignored.
+ *
+ * NOTE(review): b2c_map is a reference parameter of a __global__ function - confirm that the tile
+ * map types used with this kernel never read state through it on the device.
+ */
 template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_B_K0_M_K1,
-          typename BGridDesc_B_K0_N_K1,
-          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename CBlockClusterAdaptor,
-          bool HasMainKBlockLoop>
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          typename Block2CTileMap>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_v2r4r2(const FloatAB* __restrict__ p_a_grid,
-                                  const FloatAB* __restrict__ p_b_grid,
-                                  FloatC* __restrict__ p_c_grid,
-                                  const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc,
-                                  const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc,
-                                  const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                  const AElementwiseOperation a_element_op,
-                                  const BElementwiseOperation b_element_op,
-                                  const CElementwiseOperation c_element_op,
-                                  const CBlockClusterAdaptor c_block_cluster_adaptor)
+        kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg,
+                                             const Block2CTileMap& b2c_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
-
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  static_cast<void*>(p_shared_block),
-                                                  a_b_k0_m_k1_grid_desc,
-                                                  b_b_k0_n_k1_grid_desc,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  c_block_cluster_adaptor);
+    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
+
+    __shared__ uint8_t p_shared[shared_size];
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
+        karg, static_cast<void*>(p_shared), b2c_map);
 #else
-    ignore = p_a_grid;
-    ignore = p_b_grid;
-    ignore = p_c_grid;
-    ignore = a_b_k0_m_k1_grid_desc;
-    ignore = b_b_k0_n_k1_grid_desc;
-    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = c_block_cluster_adaptor;
+    ignore = karg;
+    ignore = b2c_map;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

@@ -79,13 +47,13 @@ template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_B_K0_M_K1,
-          typename BGridDesc_B_K0_N_K1,
-          typename CMNGridDesc,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
          index_t MPerBlock,
          index_t NPerBlock,
          index_t K0PerBlock,
@@ -126,10 +94,238 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
    static constexpr auto I7 = Number<7>{};

    // K1 should be Number<...>
-    static constexpr auto K1 = Number<K1Value>{};
+    static constexpr auto K1  = Number<K1Value>{};
+    static constexpr auto M01 = 1;
+    static constexpr auto N01 = 1;

    using ThisThreadBlock = ThisThreadBlock<BlockSize>;

+    // Value bundle describing one GEMM problem: the three grid pointers, the problem sizes and
+    // strides, the padded sizes, K0 and the split-K factor. Member order is kept as-is.
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        const FloatAB* p_a_grid;
+        const FloatAB* p_b_grid;
+        FloatC* p_c_grid;
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t StrideA;
+        index_t StrideB;
+        index_t StrideC;
+        index_t MPadded;
+        index_t NPadded;
+        index_t KPadded;
+        index_t K0;
+        index_t k_batch;
+
+        // Stores every parameter in the member of the same name (without trailing underscore).
+        Argument(const FloatAB* p_a_grid_,
+                 const FloatAB* p_b_grid_,
+                 FloatC* p_c_grid_,
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 index_t StrideA_,
+                 index_t StrideB_,
+                 index_t StrideC_,
+                 index_t MPadded_,
+                 index_t NPadded_,
+                 index_t KPadded_,
+                 index_t K0_,
+                 index_t k_batch_)
+            : p_a_grid{p_a_grid_},
+              p_b_grid{p_b_grid_},
+              p_c_grid{p_c_grid_},
+              M{M_},
+              N{N_},
+              K{K_},
+              StrideA{StrideA_},
+              StrideB{StrideB_},
+              StrideC{StrideC_},
+              MPadded{MPadded_},
+              NPadded{NPadded_},
+              KPadded{KPadded_},
+              K0{K0_},
+              k_batch{k_batch_}
+        {
+        }
+
+        // Writes all scalar fields (not the pointers) to std::cout on a single line.
+        void Print() const
+        {
+            std::cout << "arg {";
+            std::cout << "M:" << M << ", ";
+            std::cout << "N:" << N << ", ";
+            std::cout << "K:" << K << ", ";
+            std::cout << "SA:" << StrideA << ", ";
+            std::cout << "SB:" << StrideB << ", ";
+            std::cout << "SC:" << StrideC << ", ";
+            std::cout << "MP:" << MPadded << ", ";
+            std::cout << "NP:" << NPadded << ", ";
+            std::cout << "KP:" << KPadded << ", ";
+            std::cout << "K0:" << K0 << ", ";
+            std::cout << "KB:" << k_batch << "}" << std::endl;
+        }
+    };
+
+    // Launch grid as a std::tuple ordered (N tiles, M tiles, k_batch).
+    __host__ __device__ static auto CalculateGridSize(const Argument& karg)
+    {
+        const auto n_tiles = math::integer_divide_ceil(karg.N, NPerBlock);
+        const auto m_tiles = math::integer_divide_ceil(karg.M, MPerBlock);
+        return std::make_tuple(n_tiles, m_tiles, karg.k_batch);
+    }
+
+    // prefer this to be called on host
+    // M rounded up to the next multiple of MPerBlock.
+    __host__ __device__ static auto CalculateMPadded(index_t M)
+    {
+        const auto m_tiles = (M + MPerBlock - 1) / MPerBlock;
+        return m_tiles * MPerBlock;
+    }
+
+    // N rounded up to the next multiple of NPerBlock.
+    __host__ __device__ static auto CalculateNPadded(index_t N)
+    {
+        const auto n_tiles = (N + NPerBlock - 1) / NPerBlock;
+        return n_tiles * NPerBlock;
+    }
+
+    // K0 extent per split: K is rounded up to whole steps of K_Batch * K0PerBlock * K1 elements,
+    // and each step contributes K0PerBlock.
+    __host__ __device__ static auto CalculateK0(index_t K, index_t K_Batch = 1)
+    {
+        // k_batch * k0 * k0_per_block * k1
+        const auto k_per_step = K_Batch * K0PerBlock * K1;
+        const auto num_steps  = (K + k_per_step - 1) / k_per_step;
+        return num_steps * K0PerBlock;
+    }
+
+    // Padded K extent: K_Batch * CalculateK0(K, K_Batch) * K1.
+    __host__ __device__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1)
+    {
+        return K_Batch * CalculateK0(K, K_Batch) * K1;
+    }
+
+    // Builds the A descriptor with dimensions ordered (KBatch, K0, M, K1):
+    //   1. naive (M, K) view whose strides depend on ALayout,
+    //   2. K right-padded by KPad - K,
+    //   3. padded K unmerged into (KBatch, K0, K1), and M either right-padded by MPad - M (for the
+    //      M-padding specializations) or passed through unchanged.
+    // The return type is deduced separately for each `if constexpr` branch.
+    __host__ __device__ static auto MakeAGridDescriptor_KBatch_K0_M_K1(index_t M,
+                                                                       index_t MPad,
+                                                                       index_t K,
+                                                                       index_t StrideA,
+                                                                       index_t KBatch,
+                                                                       index_t K0,
+                                                                       index_t KPad)
+    {
+        // RowMajor: strides (StrideA, 1); ColumnMajor: strides (1, StrideA). For any other
+        // ALayout the lambda has no return statement.
+        const auto a_grid_desc_m_k = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }();
+
+        // K is always padded up to KPad, regardless of GemmSpec.
+        const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
+            a_grid_desc_m_k,
+            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            // const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
+            // The pad amount comes from the caller-supplied MPad instead of being recomputed here.
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+        else
+        {
+            // No M padding: MPad is unused in this branch.
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)),
+                           make_pass_through_transform(M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+    }
+
+    // Builds the grid descriptor for B as a 4-D [KBatch, K0, N, K1] view.
+    //   K, N       : extents of the underlying [K, N] matrix
+    //   NPad       : N extent after right-padding (read only when GemmSpec pads N)
+    //   StrideB    : stride of the non-unit dimension (K for RowMajor, N for ColumnMajor)
+    //   KBatch, K0 : K is right-padded to KPad and then unmerged into (KBatch, K0, K1)
+    __host__ __device__ static auto MakeBGridDescriptor_KBatch_K0_N_K1(index_t K,
+                                                                       index_t NPad,
+                                                                       index_t N,
+                                                                       index_t StrideB,
+                                                                       index_t KBatch,
+                                                                       index_t K0,
+                                                                       index_t KPad)
+    {
+        // N is right-padded only for the specializations that include N padding.
+        constexpr bool pad_n =
+            GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+            GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+            GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+            GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding;
+
+        // Plain [K, N] view of B; which dimension is unit-stride depends on BLayout.
+        const auto b_k_n_desc = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
+            }
+        }();
+
+        // Right-pad K up to KPad; N is left untouched at this stage.
+        const auto b_kpad_n_desc = transform_tensor_descriptor(
+            b_k_n_desc,
+            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        // Padded K is split into (KBatch, K0, K1), landing on output dims 0, 1 and 3.
+        const auto split_k = make_unmerge_transform(make_tuple(KBatch, K0, K1));
+
+        if constexpr(pad_n)
+        {
+            return transform_tensor_descriptor(
+                b_kpad_n_desc,
+                make_tuple(split_k, make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                b_kpad_n_desc,
+                make_tuple(split_k, make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
+        }
+    }
+
+    // Builds the grid descriptor for C as an [M, N] view, right-padding each
+    // dimension that GemmSpec pads.
+    //   M, N       : extents of the underlying C matrix
+    //   MPad, NPad : extents after right-padding (read only for the padded dimensions)
+    //   StrideC    : stride of the non-unit dimension (M for RowMajor, N for ColumnMajor)
+    //
+    // The padding conditions match the A/B descriptor builders and CheckValidity():
+    // M is padded for MPadding/MNPadding/MKPadding/MNKPadding and N for
+    // NPadding/MNPadding/NKPadding/MNKPadding. Padding C only for MNPadding would
+    // leave C unpadded for the other specializations even though CheckValidity()
+    // accepts M/N that are not multiples of MPerBlock/NPerBlock for them.
+    __host__ __device__ static auto
+    MakeCGridDescriptor_M_N(index_t M, index_t N, index_t MPad, index_t NPad, index_t StrideC)
+    {
+        constexpr bool pad_m =
+            GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+            GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+            GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+            GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding;
+
+        constexpr bool pad_n =
+            GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+            GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+            GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+            GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding;
+
+        // Plain [M, N] view of C; which dimension is unit-stride depends on CLayout.
+        const auto c_grid_desc_m_n = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }();
+
+        if constexpr(pad_m && pad_n)
+        {
+            return transform_tensor_descriptor(c_grid_desc_m_n,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(pad_m)
+        {
+            return transform_tensor_descriptor(c_grid_desc_m_n,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_pass_through_transform(N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(pad_n)
+        {
+            return transform_tensor_descriptor(c_grid_desc_m_n,
+                                               make_tuple(make_pass_through_transform(M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                c_grid_desc_m_n,
+                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
+
    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
    {
        constexpr auto max_lds_align = K1;
@@ -178,45 +374,68 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                         c_block_size * sizeof(FloatC));
    }

-    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
-    template <typename Block2CTileMap>
-    __host__ __device__ static constexpr bool
-    CheckValidity(const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
-                  const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
-                  const CMNGridDesc& c_m_n_grid_desc,
-                  const Block2CTileMap& block_2_ctile_map)
+    // Returns true when the problem described by karg passes the checks below:
+    //   - M (resp. N) is a multiple of MPerBlock (resp. NPerBlock), unless GemmSpec
+    //     is one of the specializations that pads that dimension;
+    //   - for each of A, B and C, the extent selected by the tensor's layout is a
+    //     multiple of that tensor's scalar-per-vector transfer width.
+    // Returns false as soon as any check fails.
+    __host__ __device__ static constexpr bool CheckValidity(const Argument& karg)
     {
-        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
-                      "wrong! K1 need to be known at compile-time");
-
-        static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) &&
-                          (NPerBlock % (NRepeat * NPerXDL)) == 0,
-                      "Invalid tuning param!");
-
-        const auto M      = a_b_k0_m_k1_grid_desc.GetLength(I2);
-        const auto N      = b_b_k0_n_k1_grid_desc.GetLength(I2);
-        const auto K0     = a_b_k0_m_k1_grid_desc.GetLength(I1);
-        const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0);
+        // Without M padding, M must already be a whole number of MPerBlock tiles.
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(karg.M % MPerBlock == 0))
+                return false;
+        }
+        // Without N padding, N must already be a whole number of NPerBlock tiles.
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(karg.N % NPerBlock == 0))
+                return false;
+        }
 
-        if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
-             K0 == b_b_k0_n_k1_grid_desc.GetLength(I1) &&
-             K1 == a_b_k0_m_k1_grid_desc.GetLength(I3) &&
-             K1 == b_b_k0_n_k1_grid_desc.GetLength(I3) &&
-             KBatch == b_b_k0_n_k1_grid_desc.GetLength(I0)))
-            return false;
+        // A source vector width: checked against K for RowMajor A, against M otherwise.
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
+        else
+        {
+            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
 
-        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0))
-            return false;
+        // B source vector width: checked against N for RowMajor B, against K otherwise.
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
+        else
+        {
+            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
 
-        if(!block_2_ctile_map.CheckValidity(c_m_n_grid_desc))
+        // C vector width: checked against N for RowMajor C, against M otherwise.
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
         {
-            return false;
+            if(karg.N % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+                return false;
+        }
+        else
+        {
+            if(karg.M % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+                return false;
         }
 
-        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
         return true;
     }

+    // Returns K rounded up to a multiple of K1 * K0PerBlock * KBatch, i.e. the
+    // padded K extent that splits into KBatch batches of K0 * K1 elements with
+    // K0 a multiple of K0PerBlock.
+    __host__ __device__ static auto GetKPad(index_t K, index_t KBatch)
+    {
+        // Number of K0PerBlock-sized steps each batch needs so the batches cover K.
+        const index_t steps_per_batch = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch);
+        const index_t k0_per_batch    = steps_per_batch * K0PerBlock;
+        const index_t k_padded        = KBatch * k0_per_batch * K1;
+        return k_padded;
+    }
+
    __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0)
    {
        const bool has_main_k0_block_loop = K0 > K0PerBlock;
@@ -224,8 +443,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        return has_main_k0_block_loop;
    }

+    template <typename CGridDesc>
    __host__ __device__ static constexpr auto
-    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CMNGridDesc& c_m_n_grid_desc)
+    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc& c_m_n_grid_desc)
    {
        const auto M = c_m_n_grid_desc.GetLength(I0);
        const auto N = c_m_n_grid_desc.GetLength(I1);
@@ -242,10 +462,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
    }

    // return block_id to C matrix tile idx (m0, n0) mapping
+    // Templated on the C grid descriptor type so any [M, N] descriptor can be passed.
+    // The M01 and N01 parameters are unnamed and ignored; the map is always
+    // constructed with the literal 8 as its second argument, followed by KBatch.
+    // NOTE(review): the 8 presumably plays the role of M01 - confirm against
+    // BlockToCTileMap_KSplit_M00_N0_M01Adapt.
+    template <typename CGridDesc>
    __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
+        const CGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
    {
-        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc>(
            c_m_n_grid_desc, 8, KBatch);
    }

@@ -262,24 +483,38 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
    }

-    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{}));
-    using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1));
+    // Creates the default block-to-C-tile map: a default-constructed
+    // BlockToCTileMap_3DGrid_KSplit for this kernel's MPerBlock x NPerBlock tile.
+    // It yields the (m0, n0, k_split) tile index for a block.
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap()
+    {
+        using TileMap = BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>;
+        return TileMap();
+    }
+
+    // Type returned by MakeCGridDescriptor_M_N with cv/ref qualifiers removed; the
+    // integer arguments are placeholders used only inside decltype.
+    using CGridDesc_M_N         = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(1, 1, 1, 1, 1))>;
+    // Type returned by MakeDefaultBlock2CTileMap with cv/ref qualifiers removed.
+    using DefaultBlock2CTileMap = remove_cvref_t<decltype(MakeDefaultBlock2CTileMap())>;

-    template <bool HasMainKBlockLoop>
-    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
-                               const FloatAB* __restrict__ p_b_grid,
-                               FloatC* __restrict__ p_c_grid,
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              typename Block2CTileMap>
+    __device__ static void Run(const Argument& karg,
                               void* __restrict__ p_shared_block,
-                               const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
-                               const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
-                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
-                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
-                               const AElementwiseOperation& a_element_op,
-                               const BElementwiseOperation& b_element_op,
-                               const CElementwiseOperation& c_element_op,
-                               const CBlockClusterAdaptor& c_block_cluster_adaptor)
+                               const Block2CTileMap& block_2_ctile_map)
    {
+        const FloatAB* p_a_grid          = karg.p_a_grid;
+        const FloatAB* p_b_grid          = karg.p_b_grid;
+        FloatC* p_c_grid                 = karg.p_c_grid;
+        const auto a_b_k0_m_k1_grid_desc = MakeAGridDescriptor_KBatch_K0_M_K1(
+            karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0, karg.KPadded);
+        const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(
+            karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0, karg.KPadded);
+        const auto c_grid_desc_m_n =
+            MakeCGridDescriptor_M_N(karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
+        const AElementwiseOperation a_element_op = AElementwiseOperation{};
+        const BElementwiseOperation b_element_op = BElementwiseOperation{};
+        const CElementwiseOperation c_element_op = CElementwiseOperation{};
+
        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -287,28 +522,28 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());

-        const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
-
-        // divide block work by [M, N]
+        // divide block work by [KBatch, M, N]
        const auto block_work_idx =
-            c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));

-        if(!c_block_cluster_adaptor.ValidCTileIndex(
-               make_tuple(block_work_idx[I1], block_work_idx[I2]),
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
        {
            return;
        }

-        const index_t k_batch_id = block_work_idx[I0];
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I2]);
+        const index_t k_batch_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);

        // HACK: this force m/n_block_data_idx_on_grid into SGPR
        const index_t m_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock);
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);

        const index_t n_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock);
+            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);

        // lds max alignment
        constexpr auto max_lds_align = K1;
@@ -444,7 +679,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
        //       register
        // sanity check
-
+#if 1
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                FloatAB,
@@ -456,6 +691,20 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                                                                MRepeat,
                                                                NRepeat,
                                                                K1>{};
+#else
+        auto blockwise_gemm = BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<
+            BlockSize,
+            FloatAB,
+            FloatAcc,
+            decltype(a_k0_m_k1_block_desc),
+            decltype(b_k0_n_k1_block_desc),
+            MPerXDL,
+            NPerXDL,
+            MRepeat,
+            NRepeat,
+            K1>{};
+
+#endif

        auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();

@@ -474,6 +723,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());

+#if 0
        // preload data into LDS
        {
            a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
@@ -519,6 +769,31 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2

            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }
+#else
+        // gridwise GEMM pipeline
+        const auto gridwise_gemm_pipeline =
+            GridwiseGemmPipeline_Selector<PipelineVersion::v2, 1, LoopScheduler::Default>();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_b_k0_m_k1_grid_desc.GetLength(I1) * a_b_k0_m_k1_grid_desc.GetLength(I3)) /
+            (K0PerBlock * K1));
+
+        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_b_k0_m_k1_grid_desc,
+                                                               a_b_k0_m_k1_block_desc,
+                                                               a_blockwise_copy,
+                                                               a_grid_buf,
+                                                               a_block_buf,
+                                                               a_block_slice_copy_step,
+                                                               b_b_k0_n_k1_grid_desc,
+                                                               b_b_k0_n_k1_block_desc,
+                                                               b_blockwise_copy,
+                                                               b_grid_buf,
+                                                               b_block_buf,
+                                                               b_block_slice_copy_step,
+                                                               blockwise_gemm,
+                                                               c_thread_buf,
+                                                               num_k_block_main_loop);
+#endif

        // output: register to global memory
        {
@@ -647,7 +922,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                {c_block_desc_mblock_mperblock_nblock_nperblock,
                 make_multi_index(0, 0, 0, 0),
                 c_grid_desc_mblock_mperblock_nblock_nperblock,
-                 make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0),
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
                 c_element_op};

            constexpr auto mxdlperwave_forward_step =
@@ -716,6 +991,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
            });
        }
    }
+
+    // Maps a GEMM layout tag to a one-letter code via Get():
+    // "R" for RowMajor, "C" for ColumnMajor, and an empty string for any other type.
+    template <typename Layout>
+    struct LStr
+    {
+        // Fallback for layouts without a dedicated specialization.
+        static std::string Get()
+        {
+            return std::string("");
+        }
+    };
+
+    template <>
+    struct LStr<ck::tensor_layout::gemm::RowMajor>
+    {
+        static std::string Get()
+        {
+            return std::string("R");
+        }
+    };
+
+    template <>
+    struct LStr<ck::tensor_layout::gemm::ColumnMajor>
+    {
+        static std::string Get()
+        {
+            return std::string("C");
+        }
+    };
+
+    // Builds a string identifying this kernel configuration, in the form
+    //   GemmXdlSplitKCShuffle_<GemmSpec>_<A><B><C>_B<BlockSize>_Vec<a>x<b>x<c>_<MPerBlock>x<NPerBlock>x<K0PerBlock>x<K1>
+    // where <A><B><C> are the first characters of the A/B/C layout names and
+    // <a>, <b>, <c> are the A/B source and C scalar-per-vector transfer widths.
+    static std::string GetTypeString()
+    {
+        // First character of each layout's name.
+        const char a_layout_tag = std::string(ALayout::name)[0];
+        const char b_layout_tag = std::string(BLayout::name)[0];
+        const char c_layout_tag = std::string(CLayout::name)[0];
+
+        std::stringstream type_name;
+
+        // clang-format off
+        type_name << "GemmXdlSplitKCShuffle_" << getGemmSpecializationString(GemmSpec) << "_";
+        type_name << a_layout_tag << b_layout_tag << c_layout_tag << "_";
+        type_name << "B" << BlockSize << "_";
+        type_name << "Vec" << ABlockTransferSrcScalarPerVector
+                  << "x" << BBlockTransferSrcScalarPerVector
+                  << "x" << CBlockTransferScalarPerVector_NWaveNPerXDL << "_";
+        type_name << MPerBlock << "x" << NPerBlock << "x" << K0PerBlock << "x" << K1;
+        // clang-format on
+
+        return type_name.str();
+    }
 };

 } // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -117,20 +117,6 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);

-void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
-
 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              GNHWC,
@@ -159,20 +145,21 @@ void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);

-void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances(
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
+                                                              NHWGK,
+                                                              BF16,
+                                                              BF16,
                                                              Empty_Tuple,
-                                                              int8_t,
+                                                              BF16,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
-// grouped conv2d forward, NHWGC/GKYXC/NHWGK
+
 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              NHWGC,
@@ -187,6 +174,20 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);

+// 2-D grouped convolution forward, NHWGC input / GKYXC weight / NHWGK output,
+// F32 input, weight and output, no D tensors (Empty_Tuple), PassThrough
+// elementwise operations. Declaration only; the definition is outside this header.
+// NOTE(review): presumably appends the XDL device instances to `instances` -
+// confirm against the definition.
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              F32,
+                                                              F32,
+                                                              Empty_Tuple,
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+
 // grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
@@ -385,12 +386,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            {
                add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(op_ptrs);
            }
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                              is_same_v<OutDataType, int8_t>)
-            {
-                add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances(op_ptrs);
-            }
        }
        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                          is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK>)
@@ -398,7 +393,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
-                // no instance
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
            }
            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                              is_same_v<OutDataType, half_t>)
@@ -409,12 +404,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                              is_same_v<WeiDataType, ck::bhalf_t> &&
                              is_same_v<OutDataType, ck::bhalf_t>)
            {
-                // no instance
-            }
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                              is_same_v<OutDataType, int8_t>)
-            {
-                // no instance
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(op_ptrs);
            }
        }
        else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, GNDHWC> &&

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -68,6 +68,58 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
                                                  PassThrough,
                                                  PassThrough>>>& instances);

+// Split-K grouped GEMM instance registration, F16 A/B/E, no D tensors
+// (Empty_Tuple), PassThrough elementwise operations. Declarations only; the
+// definitions are outside this header.
+// NOTE(review): presumably each function appends device instances to
+// `instances` - confirm against the definitions.
+
+// A Row-major, B Column-major, E Row-major (mk_nk_mn).
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+// A Row-major, B Row-major, E Row-major (mk_kn_mn).
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+// "irregular" variant for A Row-major, B Column-major, E Row-major (mk_nk_mn).
+// NOTE(review): "irregular" presumably targets problem sizes that need padding -
+// confirm against the instance definitions.
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+// "irregular" variant for A Row-major, B Row-major, E Row-major (mk_kn_mn).
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+
 template <typename ALayout,
          typename BLayout,
          typename ELayout,
@@ -109,11 +161,17 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                         is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                              is_same_v<ELayout, Row>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
@@ -17,14 +17,14 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_F32_Tuple,
@@ -36,10 +36,10 @@ void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(

 void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_F32_Tuple,
@@ -52,10 +52,10 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
 void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_F32_Tuple,
@@ -68,10 +68,10 @@ void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
 void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_F32_Tuple,
@@ -83,10 +83,10 @@ void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(

 void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_F32_Tuple,
@@ -99,10 +99,10 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
 void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_F32_Tuple,
@@ -154,9 +154,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)
@@ -220,9 +220,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
@@ -17,14 +17,14 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_Tuple,
@@ -36,10 +36,10 @@ void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(

 void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_Tuple,
@@ -51,10 +51,10 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(

 void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_Tuple,
@@ -67,10 +67,10 @@ void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
 void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_Tuple,
@@ -82,10 +82,10 @@ void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(

 void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_Tuple,
@@ -97,10 +97,10 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(

 void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_Tuple,
@@ -152,9 +152,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)
@@ -218,9 +218,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp
@@ -17,13 +17,13 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              F32_Tuple,
@@ -35,10 +35,10 @@ void add_device_conv2d_dl_perchannel_quantization_int8_instances(

 void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              F32_Tuple,
@@ -50,10 +50,10 @@ void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(

 void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              F32_Tuple,
@@ -65,10 +65,10 @@ void add_device_conv2d_xdl_perchannel_quantization_int8_instances(

 void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              F32_Tuple,
@@ -119,9 +119,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<OutDataType, int8_t>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp
@@ -17,13 +17,13 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
@@ -35,10 +35,10 @@ void add_device_conv2d_dl_perlayer_quantization_int8_instances(

 void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
@@ -50,10 +50,10 @@ void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(

 void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
@@ -65,10 +65,10 @@ void add_device_conv2d_xdl_perlayer_quantization_int8_instances(

 void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
@@ -117,8 +117,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
-                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, GNHWK>)
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
+                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<OutDataType, int8_t>)

--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -26,7 +26,8 @@ using S = ck::Sequence<Is...>;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// Disabled: the instances in this hunk use GemmMNPadding instead. static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;

 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
@@ -35,14 +36,22 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
        //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
        //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
        //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 16, 1, 4>,               8>,
-        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   192,     4,  8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 48, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   192,    64,     4,  8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 16, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,   192,     4,  8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 24, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   192,    32,     4,  8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,    64,     4,  8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    64,    32,     4,  8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              8,      true,           1,           1,                   S<1, 32, 1, 4>,               8>
    // clang-format on
    >;


--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -3,11 +3,11 @@ add_instance_library(device_grouped_conv2d_fwd_instance
   device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
   device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
   device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
-   device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
   # NHWGC, GKYXC, NHWGK
+   device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
   device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
-   #dl 
+   device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+   #dl
   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
-   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
 )
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// Include-once guard: this header defines namespace-scope aliases and
+// `static constexpr` variables, so a second inclusion in the same translation
+// unit would otherwise be a redefinition error.
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Short names for the element data types referenced by the instance lists.
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+// Empty type list; the instance sources pass it in the "multiple D" layout and
+// data-type template positions.
+using Empty_Tuple = ck::Tuple<>;
+
+// Shorthand for a compile-time integer sequence.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+// Input tensor layouts.
+using NHWGC = ck::tensor_layout::convolution::NHWGC;
+using GNHWC = ck::tensor_layout::convolution::GNHWC;
+
+// Weight tensor layout.
+using GKYXC = ck::tensor_layout::convolution::GKYXC;
+
+// Output tensor layouts.
+using NHWGK = ck::tensor_layout::convolution::NHWGK;
+using GNHWK = ck::tensor_layout::convolution::GNHWK;
+
+// Element-wise operation alias used for the In/Wei/Out operation slots.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Short names for the ConvolutionForwardSpecialization enumerators.
+static constexpr auto ConvFwdDefault =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto ConvFwd1x1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
+
+static constexpr auto ConvFwd1x1S1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
+
+static constexpr auto ConvFwdOddC =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
+
+// Short name for the GemmSpecialization::MNKPadding enumerator.
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "device_grouped_conv2d_fwd_dl_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using InDataType  = ck::half_t;
-using WeiDataType = ck::half_t;
-using AccDataType = float;
-using OutDataType = ck::half_t;
-
-using Empty_Tuple = ck::Tuple<>;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-using InLayout  = ck::tensor_layout::convolution::GNHWC;
-using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
-
-static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-static constexpr auto Filter1x1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-static constexpr auto Filter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances = std::tuple<
-    // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Pad0_instances = std::tuple<
-    // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,  Filter1x1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Stride1Pad0_instances =
-    std::tuple<
-        // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|          Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|              Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation|       Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |                     |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp, Filter1x1Stride1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              InLayout,
-                                                              WeiLayout,
+                                                              GNHWC, // input layout (replaces the file-local alias InLayout)
+                                                              GKYXC, // weight layout (replaces WeiLayout)
                                                               Empty_Tuple,
-                                                              OutLayout,
-                                                              InDataType,
-                                                              WeiDataType,
+                                                              GNHWK, // output layout (replaces OutLayout)
+                                                              F16, // input data type (replaces InDataType, which was ck::half_t)
+                                                              F16, // weight data type (replaces WeiDataType, which was ck::half_t)
                                                               Empty_Tuple,
-                                                              OutDataType,
-                                                              InElementOp,
-                                                              WeiElementOp,
-                                                              OutElementOp>>>& instances)
+                                                              F16, // output data type (replaces OutDataType, which was ck::half_t)
+                                                              PassThrough, // input element-wise op (replaces InElementOp)
+                                                              PassThrough, // weight element-wise op (replaces WeiElementOp)
+                                                              PassThrough>>>& instances) // output element-wise op (replaces OutElementOp); `instances` is taken by non-const reference
 {
    add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances{});
+                                   device_grouped_conv2d_fwd_dl_f16_instances<GNHWC, // 1st call: takes over from the removed file-local ..._f16_instances tuple
+                                                                              GKYXC, // NOTE(review): argument order presumably in/wei/Ds/out layout, Ds type, element op, conv spec — confirm in device_grouped_conv2d_fwd_dl_instance.hpp
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwdDefault>{}); // ConvolutionForwardSpecialization::Default

-    add_device_operation_instances(
-        instances, device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Pad0_instances{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f16_instances<GNHWC, // 2nd call: takes over from the removed ..._f16_Filter1x1Pad0_instances tuple
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1P0>{}); // ConvolutionForwardSpecialization::Filter1x1Pad0

-    add_device_operation_instances(
-        instances,
-        device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Stride1Pad0_instances{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f16_instances<GNHWC, // 3rd call: takes over from the removed ..._f16_Filter1x1Stride1Pad0_instances tuple
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1S1P0>{}); // ConvolutionForwardSpecialization::Filter1x1Stride1Pad0
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "device_grouped_conv2d_fwd_dl_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using InDataType  = float;
-using WeiDataType = float;
-using AccDataType = float;
-using OutDataType = float;
-
-using Empty_Tuple = ck::Tuple<>;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-using InLayout  = ck::tensor_layout::convolution::GNHWC;
-using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
-
-static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-static constexpr auto Filter1x1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-static constexpr auto Filter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances = std::tuple<
-    // clang-format off
-           // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Pad0_instances = std::tuple<
-    // clang-format off
-           // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,  Filter1x1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Stride1Pad0_instances =
-    std::tuple<
-        // clang-format off
-           // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|          Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|              Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation|       Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |                     |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp, Filter1x1Stride1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-        // clang-format on
-        >;
-
+// Passes the f32 DL grouped-conv2d-forward instance tuples to
+// add_device_operation_instances, one call per convolution specialization:
+// ConvFwdDefault, ConvFwd1x1P0 and ConvFwd1x1S1P0. Every call uses the GNHWC / GKYXC /
+// GNHWK layouts, Empty_Tuple for the D tensors and PassThrough as the output element op.
 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              InLayout,
-                                                              WeiLayout,
+                                                              GNHWC,
+                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              OutLayout,
-                                                              InDataType,
-                                                              WeiDataType,
+                                                              GNHWK,
+                                                              F32,
+                                                              F32,
                                                              Empty_Tuple,
-                                                              OutDataType,
-                                                              InElementOp,
-                                                              WeiElementOp,
-                                                              OutElementOp>>>& instances)
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances{});
+                                   device_grouped_conv2d_fwd_dl_f32_instances<GNHWC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwdDefault>{});

-    add_device_operation_instances(
-        instances, device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Pad0_instances{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f32_instances<GNHWC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1P0>{});

-    add_device_operation_instances(
-        instances,
-        device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Stride1Pad0_instances{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_dl_f32_instances<GNHWC,
+                                                                              GKYXC,
+                                                                              Empty_Tuple,
+                                                                              GNHWK,
+                                                                              Empty_Tuple,
+                                                                              PassThrough,
+                                                                              ConvFwd1x1S1P0>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using InDataType  = int8_t;
-using WeiDataType = int8_t;
-using AccDataType = int32_t;
-using OutDataType = int8_t;
-
-using Empty_Tuple = ck::Tuple<>;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-using InLayout  = ck::tensor_layout::convolution::GNHWC;
-using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
-
-static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-static constexpr auto Filter1x1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-static constexpr auto Filter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances = std::tuple<
-    // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Pad0_instances = std::tuple<
-    // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,  Filter1x1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-    // clang-format on
-    >;
-
-using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Stride1Pad0_instances =
-    std::tuple<
-        // clang-format off
-           // ########################################|        NDim|     InData|     WeiData|    MultpleD|     OutData|     AccData| InLayout| WeiLayout|   MultipleD| OutLayout|           In|           Wei|           Out|          Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-           // ########################################|     Spatial|       Type|        Type|        Type|        Type|        Type|         |          |      Layout|          |  Elementwise|   Elementwise|   Elementwise|              Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-           // ########################################|            |           |            |            |            |            |         |          |            |          |    Operation|     Operation|     Operation|       Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-           // ########################################|            |           |            |            |            |            |         |          |            |          |             |              |              |                     |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp, Filter1x1Stride1Pad0,    GemmPadingSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
-        // clang-format on
-        >;
-
-void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              InLayout,
-                                                              WeiLayout,
-                                                              Empty_Tuple,
-                                                              OutLayout,
-                                                              InDataType,
-                                                              WeiDataType,
-                                                              Empty_Tuple,
-                                                              OutDataType,
-                                                              InElementOp,
-                                                              WeiElementOp,
-                                                              OutElementOp>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances{});
-
-    add_device_operation_instances(
-        instances, device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Pad0_instances{});
-
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Stride1Pad0_instances{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+// This header defines alias templates, so it must be safe to include more than
+// once in a single translation unit.
+#pragma once
+
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+#include "device_grouped_conv2d_fwd_common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Tuple of DL grouped 2-D forward-convolution device instances for F16 input, weight and
+// output data with F32 accumulation. The layouts, the D-tensor layout/data-type tuples, the
+// output element-wise op (CDEElementOp) and the convolution specialization (ConvSpec) are
+// supplied by the caller; the input and weight element-wise ops are fixed to PassThrough.
+// The column headers inside the clang-format-off region name each positional argument.
+template <typename InLayout,
+          typename WeiLayout,
+          typename DsLayout,
+          typename OutLayout,
+          typename DsDatatype,
+          typename CDEElementOp,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv2d_fwd_dl_f16_instances = std::tuple<
+    // clang-format off
+           // ########################################|        NDim| InData| WeiData|   MultipleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout|          In|          Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+           // ########################################|     Spatial|   Type|    Type|        Type|    Type|    Type|         |          |    Layout|          | Elementwise|  Elementwise|   Elementwise|        Forward|    Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+           // ########################################|            |       |        |            |        |        |         |          |          |          |   Operation|    Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+           // ########################################|            |       |        |            |        |        |         |          |          |          |            |             |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2,    F16,     F16,  DsDatatype,     F16,     F32, InLayout, WeiLayout,  DsLayout, OutLayout, PassThrough,  PassThrough,  CDEElementOp,       ConvSpec,    GemmMNKPadding,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
+    // clang-format on
+    >;
+
+// Tuple of DL grouped 2-D forward-convolution device instances for F32 input, weight and
+// output data with F32 accumulation. The layouts, the D-tensor layout/data-type tuples, the
+// output element-wise op (CDEElementOp) and the convolution specialization (ConvSpec) are
+// supplied by the caller; the input and weight element-wise ops are fixed to PassThrough.
+// The column headers inside the clang-format-off region name each positional argument.
+template <typename InLayout,
+          typename WeiLayout,
+          typename DsLayout,
+          typename OutLayout,
+          typename DsDatatype,
+          typename CDEElementOp,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv2d_fwd_dl_f32_instances = std::tuple<
+    // clang-format off
+           // ########################################|        NDim| InData| WeiData|   MultipleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout|          In|          Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+           // ########################################|     Spatial|   Type|    Type|        Type|    Type|    Type|         |          |    Layout|          | Elementwise|  Elementwise|   Elementwise|        Forward|    Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+           // ########################################|            |       |        |            |        |        |         |          |          |          |   Operation|    Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+           // ########################################|            |       |        |            |        |        |         |          |          |          |            |             |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+        DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<           2,    F32,     F32,  DsDatatype,     F32,     F32, InLayout, WeiLayout,  DsLayout, OutLayout, PassThrough, PassThrough,  CDEElementOp,       ConvSpec,    GemmMNKPadding,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_fwd_xdl_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-
-using BF16 = ck::bhalf_t;
-using F32  = float;
-
-using Empty_Tuple = ck::Tuple<>;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto ConvFwdOddC =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
-
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
 // Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
-using device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances =
-    std::tuple<
-        // clang-format off
-        // Default
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Stride1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // OddC
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,    64,    32,   8,   8,   32,   32,    4,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,  BF16,  BF16,     F32,     BF16, Empty_Tuple,  BF16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    1,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              GNHWC,
@@ -146,8 +23,41 @@ void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances)
 {
-    add_device_operation_instances(
-        instances, device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances{});
+    add_device_operation_instances(instances, // 1 of 4 calls, one per conv-forward specialization: ConvFwdDefault
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<GNHWC, // NOTE(review): conv2d instances added by a function named "conv1d" (interface is DeviceGroupedConvFwdMultipleD<2, ...>) - confirm the name is intentional
+                                                                                GKYXC,
+                                                                                Empty_Tuple, // NOTE(review): presumably the Ds layouts (none) - template parameter list not visible here, confirm
+                                                                                GNHWK,
+                                                                                Empty_Tuple, // NOTE(review): presumably the Ds data types (none) - confirm
+                                                                                PassThrough,
+                                                                                ConvFwdDefault>{});
+
+    add_device_operation_instances(instances, // same template arguments, ConvFwd1x1P0 specialization
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances, // same template arguments, ConvFwd1x1S1P0 specialization
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances, // same template arguments, ConvFwdOddC specialization
+                                   device_grouped_conv2d_fwd_xdl_bf16_instances<GNHWC,
+                                                                                GKYXC,
+                                                                                Empty_Tuple,
+                                                                                GNHWK,
+                                                                                Empty_Tuple,
+                                                                                PassThrough,
+                                                                                ConvFwdOddC>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_fwd_xdl_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Empty_Tuple = ck::Tuple<>;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto ConvFwdOddC =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
-
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
 // Compilation parameters for in[g, n, hi ,wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
-using device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances =
-    std::tuple<
-        // clang-format off
-        // Default
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // Filter1x1Stride1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-
-        // OddC
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4,  2, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   256,   256,    64,    32,   8,   8,   32,   32,    4,    1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 32, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    ConvFwdOddC, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    1,    2,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<2, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 4>,               8>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              GNHWC,
@@ -147,7 +24,40 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
                                                              PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances{});
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f16_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdOddC>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "device_grouped_conv2d_fwd_xdl_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
-
-using F32 = float;
-
-using Empty_Tuple = ck::Tuple<>;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
-
-static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
-
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
 // Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
-using device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances =
-    std::tuple<
-        // clang-format off
-        // Default
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-
-        // Filter1x1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-
-        // Filter1x1Stride1Pad0
-        //########################################|  NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################| Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|        |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2,  GNHWC,  GKYXC, Empty_Tuple,  GNHWK,   F32,   F32,     F32,      F32, Empty_Tuple,   F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              GNHWC,
@@ -119,7 +24,40 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                              PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances{});
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv2d_fwd_xdl_f32_instances<GNHWC,
+                                                                               GKYXC,
+                                                                               Empty_Tuple,
+                                                                               GNHWK,
+                                                                               Empty_Tuple,
+                                                                               PassThrough,
+                                                                               ConvFwdOddC>{});
 }

 } // namespace instance