Commit c5ad2e80 authored by Jun Liu

Merge branch 'develop' into amd-develop

parents 4b798833 489c78d0
......@@ -4,7 +4,6 @@
#pragma once
#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp"
#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp"
......
......@@ -11,11 +11,11 @@ namespace ck_tile {
// host side args
struct Rmsnorm2dFwdHostArgs
{
const void* p_x;
const void* p_gamma;
const void* p_x; // [m, n], input, fp16/bf16
const void* p_gamma; // [1, n], gamma, prec same as input
void* p_y;
void* p_invRms;
void* p_y; // [m, n], output, fp16/bf16
void* p_invRms; // [m, 1], output inv-rms, prec same as input, nullptr if not used
float epsilon;
......@@ -83,7 +83,7 @@ struct Rmsnorm2dFwd
CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
{
return (hargs.m + Block_M - 1) / Block_M;
return dim3(integer_divide_ceil(hargs.m, Block_M));
}
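    // e.g. m = 1000 rows with Block_M = 4 gives dim3(250): one block per Block_M-row
    // slab, with integer_divide_ceil rounding up whenever m % Block_M != 0.
    // (illustrative values, not from this commit)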
CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
......@@ -149,7 +149,7 @@ struct Rmsnorm2dFwd
number<1>{});
const auto tmp2_ =
pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadM>{});
pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadN>{});
return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
}();
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
/*
// clang-format off
4-level descriptor: BlockTile -> WarpPerBlock -> WarpTile -> Vector

                 Block_N (Warp_N * WarpPerBlock_N * Repeat_N)
        +<---------------------< Repeat_N(2) >--------------------->+
        |                                                           |
        +<-- <WarpPerBlock_N(2)> -->+
                Warp_N
        +--------------+--------------+--------------+--------------+----+
 Warp_M |    warp_0    |    warp_1    |              |              |  ^
        +--------------+--------------+              |              |  <WarpPerBlock_M(2)>
        |    warp_2    |    warp_3    |              |              |  v
        +--------------+--------------+--------------+--------------+----+    Block_M
        |              |              |              |              |     |
        +              +              |              |              |     |
        |              |              |              |              |     v
        +--------------+--------------+--------------+--------------+     +

        each warp-tile (e.g. 16 threads per row)
        Vector_N (contiguous pixels each thread holds along N, i.e. the vector size)
        +-----------+-----------+-----------+-----------+-----------+
        |  thrd_0   |  thrd_1   |  thrd_2   |  thrd_3   | ...          Vector_M
        +-----------+-----------+-----------+-----------+-----------+
        |  thrd_16  |  thrd_17  |  thrd_18  |  thrd_19  | ...
        +-----------+-----------+-----------+-----------+-----------+
// clang-format on
*/
template <typename BlockTile_, // block size, seq<M, N>
typename WarpPerBlock_, // num warps along seq<M, N>
typename WarpTile_, // warp size, seq<M, N>
typename Vector_, // contiguous pixels (vector size) along seq<M, N>
index_t BlockSize_ =
warpSize* reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
struct Rmsnorm2dShape
{
// block size
static constexpr index_t Block_M = BlockTile_::at(number<0>{});
static constexpr index_t Block_N = BlockTile_::at(number<1>{});
// num warps along seq<M, N>, within each block
static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{});
static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{});
// warp size
static constexpr index_t Warp_M = WarpTile_::at(number<0>{});
static constexpr index_t Warp_N = WarpTile_::at(number<1>{});
static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
// repeat of each thread along seq<M, N>
static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
// vector size along seq<M, N>
static constexpr index_t Vector_M = Vector_::at(number<0>{});
static constexpr index_t Vector_N = Vector_::at(number<1>{});
static_assert(Warp_M % Vector_M == 0);
static_assert(Warp_N % Vector_N == 0);
// num of threads along seq<M, N>, within each warp
static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
static constexpr index_t BlockSize = BlockSize_;
};
} // namespace ck_tile
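For illustration, a minimal instantiation sketch of the shape above (hypothetical tile sizes chosen so the static_asserts hold, assuming warpSize == 64; not part of this commit):

using Shape = ck_tile::Rmsnorm2dShape<ck_tile::sequence<2, 512>,  // BlockTile: Block_M, Block_N
                                      ck_tile::sequence<2, 2>,    // WarpPerBlock along M, N
                                      ck_tile::sequence<1, 128>,  // WarpTile: Warp_M, Warp_N
                                      ck_tile::sequence<1, 2>>;   // Vector: Vector_M, Vector_N
static_assert(Shape::Repeat_N == 2);          // 512 / (2 * 128)
static_assert(Shape::ThreadPerWarp_N == 64);  // 128 / 2
static_assert(Shape::BlockSize == 256);       // warpSize * (2 * 2) warps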
......@@ -26,6 +26,7 @@ struct Rmsnorm2dFwdPipelineDefaultPolicy
sequence<1, 1, 2, 2>,
sequence<0, 3, 0, 3>>{});
}
template <typename Problem>
CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution()
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp"
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_default_policy.hpp"
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp"
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_problem.hpp"
#include "ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
namespace ck_tile {
// host side args
struct SmoothquantHostArgs
{
const void* p_x; // [m, n], input, fp16/bf16
const void* p_xscale; // [1, n], input, columnwise scale, fp32
void* p_yscale; // [m, 1], output, rowwise quant scale (amax / 127) of (p_x * p_xscale)
void* p_qy; // [m, n], output, p_x * p_xscale / p_yscale
index_t m;
index_t n;
index_t stride; // row_stride
};
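// A host-side fill-in sketch (hypothetical device buffers x_dev/xscale_dev/yscale_dev/qy_dev,
// not part of this commit; field meanings as commented above):
//   SmoothquantHostArgs hargs{};
//   hargs.p_x      = x_dev;      // fp16/bf16 [m, n] input
//   hargs.p_xscale = xscale_dev; // fp32 [1, n] columnwise scale
//   hargs.p_yscale = yscale_dev; // [m, 1] rowwise scale, written by the kernel
//   hargs.p_qy     = qy_dev;     // int8 [m, n] quantized output, written by the kernel
//   hargs.m = m; hargs.n = n; hargs.stride = n; // packed rows: stride == n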
// TODO: extract some types into a wrapper class
template <typename Pipeline_>
struct Smoothquant
{
using Pipeline = remove_cvref_t<Pipeline_>;
using Problem = typename Pipeline::Problem;
using XDataType = remove_cvref_t<typename Problem::XDataType>;
using XScaleDataType = remove_cvref_t<typename Problem::XScaleDataType>;
using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
using YScaleDataType = remove_cvref_t<typename Problem::YScaleDataType>;
using QYDataType = remove_cvref_t<typename Problem::QYDataType>;
static constexpr index_t Block_M = Problem::BlockShape::Block_M;
static constexpr index_t Block_N = Problem::BlockShape::Block_N;
static constexpr bool kPadM = false; // always no need to pad along M
static constexpr bool kPadN = Problem::kPadN;
static constexpr bool kTwoPass = Problem::kTwoPass;
static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
static constexpr index_t Vector_N = Problem::BlockShape::Vector_N;
static constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N;
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
struct Kargs
{
const void* p_x;
const void* p_xscale;
void* p_yscale;
void* p_qy;
index_t m;
index_t n;
index_t stride; // row_stride
};
using Hargs = SmoothquantHostArgs;
CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
{
return Kargs{
hargs.p_x, hargs.p_xscale, hargs.p_yscale, hargs.p_qy, hargs.m, hargs.n, hargs.stride};
}
CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
{
return dim3(integer_divide_ceil(hargs.m, Block_M));
}
CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }
// clang-format off
template <typename T> struct t2s;
template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
// clang-format on
// in byte
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }
CK_TILE_HOST static std::string GetName()
{
// clang-format off
using S_ = typename Problem::BlockShape;
auto suffix = [&]() {
std::string n;
if (kPadN) n += "_pn";
if (kTwoPass) n += "_2p";
return n; }();
#define _SS_ std::string
#define _TS_ std::to_string
return _SS_("smoothquant_fwd_") + _SS_(t2s<XDataType>::name) + "_" +
_TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
_TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
_SS_(Pipeline::name) + suffix;
#undef _SS_
#undef _TS_
// clang-format on
}
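    // For example, a 2x512 block with 2x2 warps, 1x128 warp tiles, 1x2 vectors and the
    // one-pass pipeline would yield a name roughly like (illustrative, not from this commit):
    //   smoothquant_fwd_fp16_2x512_2x2_1x128_1x2_bpr_op
    // with "_pn" / "_2p" appended when kPadN / kTwoPass are enabled.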
CK_TILE_DEVICE void operator()(Kargs kargs) const
{
const auto iM = get_block_id() * Block_M;
const auto x_window = [&]() {
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<const XDataType*>(kargs.p_x),
make_tuple(kargs.m, kargs.n),
make_tuple(kargs.stride, 1),
number<Vector_N>{},
number<1>{});
const auto tmp2_ = pad_tensor_view(
tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
return make_tile_window(
tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
}();
const auto xscale_window = [&]() {
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<const XScaleDataType*>(kargs.p_xscale),
make_tuple(kargs.n),
make_tuple(1),
number<Vector_N>{},
number<1>{});
const auto tmp2_ =
pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadN>{});
return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
}();
auto yscale_window = [&]() {
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<YScaleDataType*>(kargs.p_yscale),
make_tuple(kargs.m),
make_tuple(1),
number<1>{});
const auto tmp2_ =
pad_tensor_view(tmp_, make_tuple(number<Block_M>{}), sequence<kPadM>{});
return make_tile_window(tmp2_, make_tuple(number<Block_M>{}), {iM});
}();
auto qy_window = [&]() {
auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<QYDataType*>(kargs.p_qy),
make_tuple(kargs.m, kargs.n),
make_tuple(kargs.stride, 1),
number<Vector_N>{},
number<1>{});
auto tmp2_ = pad_tensor_view(
tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
return make_tile_window(
tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
}();
__shared__ char smem[GetSmemSize()];
Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.n, smem);
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
namespace ck_tile {
struct SmoothquantPipelineDefaultPolicy
{
template <typename Problem>
CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
{
using S = typename Problem::BlockShape;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<>,
tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
tuple<sequence<1, 2>, sequence<1, 2>>,
tuple<sequence<1, 1>, sequence<2, 2>>,
sequence<1, 1, 2, 2>,
sequence<0, 3, 0, 3>>{});
}
template <typename Problem>
CK_TILE_DEVICE static constexpr auto MakeXScaleBlockTileDistribution()
{
using S = typename Problem::BlockShape;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<S::WarpPerBlock_M, S::ThreadPerWarp_M>,
tuple<sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
tuple<sequence<0, 1>, sequence<0, 1>>,
tuple<sequence<0, 1>, sequence<1, 2>>,
sequence<1, 1>,
sequence<0, 3>>{});
}
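    // Note: for the xscale tile the M thread coordinates (WarpPerBlock_M, ThreadPerWarp_M)
    // sit in the replication sequence, so every row of threads loads the same [1, n]
    // scale values; only the N dimension is actually distributed.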
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
{
using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
typename Problem::ComputeDataType,
typename Problem::BlockShape>;
return BlockReduce2d<P_>{};
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
{
using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
typename Problem::ComputeDataType,
typename Problem::BlockShape>;
return BlockReduce2dSync<P_>{};
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
{
using P_ = BlockReduce2dProblem<typename Problem::ComputeDataType,
typename Problem::ComputeDataType,
typename Problem::BlockShape>;
return BlockReduce2dCrossWarpSync<P_>{};
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
if constexpr(Problem::kNeedCrossWarpSync)
{
using P_ = BlockReduce2dProblem<typename Problem::XDataType,
typename Problem::ComputeDataType,
typename Problem::BlockShape>;
using block_reduce2d = BlockReduce2d<P_>;
using x_block_tile =
decltype(make_static_distributed_tensor<typename Problem::XDataType>(
MakeXBlockTileDistribution<Problem>()));
using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile<x_block_tile>());
return GetBlockReduce2dCrossWarpSync<Problem>().template GetSmemSize<y_block_tile>();
}
else
{
return 1; // zero size arrays are an extension
}
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
template <typename Problem_, typename Policy_ = SmoothquantPipelineDefaultPolicy>
struct SmoothquantPipelineOnePass
{
using Problem = ck_tile::remove_cvref_t<Problem_>;
using Policy = ck_tile::remove_cvref_t<Policy_>;
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
using XScaleDataType = ck_tile::remove_cvref_t<typename Problem::XScaleDataType>;
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
using QYDataType = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
using YScaleDataType = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM
static constexpr bool kPadN = Problem::kPadN;
static constexpr const char* name = []() {
if constexpr(kNeedCrossWarpSync)
return "bpr_op"; // block per row
else
return "wpr_op"; // warp per row
}();
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
return Policy::template GetSmemSize<Problem>();
}
template <typename XWindow, typename XScaleWindow, typename QYWindow, typename YScaleWindow>
CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
const XScaleWindow& xscale_window_,
YScaleWindow& yscale_window,
QYWindow& qy_window,
ck_tile::index_t,
void* smem) const
{
auto x_window =
make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
auto xscale_window = make_tile_window(
xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());
auto reduce_absmax_func = ReduceOp::AbsMax{};
auto reduce_max_func = ReduceOp::Max{};
auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
auto block_reduce2d_cross_warp_sync =
Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
const auto x = load_tile(x_window);
const auto xscale = load_tile(xscale_window);
auto y = tile_elementwise_in(
[&](const auto& a, const auto& b) {
return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
},
x,
xscale);
// compute absmax, cross-lane->cross-warp
auto absmax = block_reduce2d(
y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
block_reduce2d_sync(absmax, reduce_max_func);
block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
// ex: yscale = absmax / 127 if int8
auto yscale = tile_elementwise_in(
[&](const auto& v_) {
return v_ / type_convert<ComputeDataType>(numeric<QYDataType>::max());
},
absmax);
store_tile(yscale_window, cast_tile<YScaleDataType>(yscale));
// quantize y to qy
auto qy = make_static_distributed_tensor<QYDataType>(y.get_tile_distribution());
sweep_tile(qy, [&](auto idx) {
constexpr auto i_idx = make_tuple(idx[number<0>{}]);
auto qy_ = y[idx] / yscale[i_idx];
qy(idx) = saturates<QYDataType>{}(qy_);
});
store_tile(qy_window, qy);
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/utility/type_traits.hpp"
namespace ck_tile {
// Y = X * XScale, QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale)
template <typename XDataType_,
typename XScaleDataType_,
typename ComputeDataType_,
typename YScaleDataType_,
typename QYDataType_,
typename BlockShape_,
bool kPadN_,
bool kTwoPass_>
struct SmoothquantPipelineProblem
{
using XDataType = remove_cvref_t<XDataType_>;
using XScaleDataType = remove_cvref_t<XScaleDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using YScaleDataType = remove_cvref_t<YScaleDataType_>;
using QYDataType = remove_cvref_t<QYDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
static constexpr bool kPadN = kPadN_;
static constexpr bool kTwoPass = kTwoPass_;
};
} // namespace ck_tile
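For reference, a scalar walk-through of the formula above (hypothetical values; QYDataType = int8):

// y = x * xscale for one row, say y = {-3.5f, 2.0f, 7.0f}
// yscale = absmax(y) / 127 = 7.0f / 127 ~= 0.0551f
// y / yscale ~= {-63.5, 36.3, 127.0}, then saturate-cast to int8 -> qy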
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
template <typename Problem_, typename Policy_ = SmoothquantPipelineDefaultPolicy>
struct SmoothquantPipelineTwoPass
{
using Problem = ck_tile::remove_cvref_t<Problem_>;
using Policy = ck_tile::remove_cvref_t<Policy_>;
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
using XScaleDataType = ck_tile::remove_cvref_t<typename Problem::XScaleDataType>;
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
using QYDataType = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
using YScaleDataType = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM
static constexpr bool kPadN = Problem::kPadN;
static constexpr const char* name = []() {
if constexpr(kNeedCrossWarpSync)
return "bpr_tp"; // block per row
else
return "wpr_tp"; // warp per row
}();
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
return Policy::template GetSmemSize<Problem>();
}
template <typename XWindow, typename XScaleWindow, typename QYWindow, typename YScaleWindow>
CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
const XScaleWindow& xscale_window_,
YScaleWindow& yscale_window,
QYWindow& qy_window,
ck_tile::index_t row_size,
void* smem) const
{
auto x_window =
make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
auto xscale_window = make_tile_window(
xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());
static constexpr index_t Block_N = Problem::BlockShape::Block_N;
index_t num_n_tile_iteration =
__builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
auto reduce_absmax_func = ReduceOp::AbsMax{};
auto reduce_max_func = ReduceOp::Max{};
auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
auto block_reduce2d_cross_warp_sync =
Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
using XTensorType = decltype(cast_tile<ComputeDataType>(load_tile(x_window)));
auto absmax = block_reduce2d.template MakeYBlockTile<XTensorType>();
set_tile(absmax, reduce_absmax_func.GetIdentityValue<ComputeDataType>());
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
const auto x = load_tile(x_window);
const auto xscale = load_tile(xscale_window);
const auto y = tile_elementwise_in(
[&](const auto& a, const auto& b) {
return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
},
x,
xscale);
block_reduce2d(y, absmax, reduce_absmax_func);
move_tile_window(x_window, {0, Block_N});
move_tile_window(xscale_window, {Block_N});
}
// compute absmax, cross-lane->cross-warp
block_reduce2d_sync(absmax, reduce_max_func);
block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);
// ex: yscale = absmax / 127 if int8
auto yscale = tile_elementwise_in(
[&](const auto& v_) {
return v_ / type_convert<ComputeDataType>(numeric<QYDataType>::max());
},
absmax);
store_tile(yscale_window, cast_tile<YScaleDataType>(yscale));
// reverse read x to reuse cache
ck_tile::index_t stride_to_right_most_window =
row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
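        // e.g. row_size = 300, Block_N = 128 (hypothetical): num_n_tile_iteration = 3, the
        // first pass leaves x_window at column 384, the -Block_N step below rewinds it to
        // 256, and stride_to_right_most_window = 300 - 300 % 128 = 256 points qy_window
        // at the same right-most tile.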
move_tile_window(x_window, {0, -Block_N});
move_tile_window(xscale_window, {-Block_N});
move_tile_window(qy_window, {0, stride_to_right_most_window});
// recompute y and quantize y to qy
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
const auto x = load_tile(x_window);
const auto xscale = load_tile(xscale_window);
const auto y = tile_elementwise_in(
[&](const auto& a, const auto& b) {
return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
},
x,
xscale);
auto qy = make_static_distributed_tensor<QYDataType>(y.get_tile_distribution());
sweep_tile(qy, [&](auto idx) {
constexpr auto i_idx = make_tuple(idx[number<0>{}]);
auto qy_ = y[idx] / yscale[i_idx];
qy(idx) = saturates<QYDataType>{}(qy_);
});
store_tile(qy_window, qy);
move_tile_window(x_window, {0, -Block_N});
move_tile_window(xscale_window, {-Block_N});
move_tile_window(qy_window, {0, -Block_N});
}
}
};
} // namespace ck_tile
......@@ -11,9 +11,10 @@ namespace ck_tile {
template <typename Problem_, typename Policy_ = void>
struct BlockWelford
{
using Problem = remove_cvref_t<Problem_>;
using XDataType = typename Problem::XDataType;
using ComputeDataType = typename Problem::ComputeDataType;
using Problem = remove_cvref_t<Problem_>;
using XDataType = typename Problem::XDataType;
using ComputeDataType = typename Problem::ComputeDataType;
static constexpr bool kFastFDiv = Problem::kFastFDiv;
CK_TILE_DEVICE constexpr BlockWelford() {}
......@@ -89,7 +90,8 @@ struct BlockWelford
template <typename Problem_, typename Policy_ = void>
struct BlockWelfordSync
{
using Problem = remove_cvref_t<Problem_>;
using Problem = remove_cvref_t<Problem_>;
static constexpr bool kFastFDiv = Problem::kFastFDiv;
template <typename MeanDistributedTensor_, typename VarDistributedTensor_>
CK_TILE_DEVICE void
......@@ -173,8 +175,9 @@ struct BlockWelfordSync
template <typename Problem_, typename Policy_ = void>
struct BlockWelfordCrossWarpSync
{
using Problem = remove_cvref_t<Problem_>;
using BlockShape = typename Problem::BlockShape;
using Problem = remove_cvref_t<Problem_>;
using BlockShape = typename Problem::BlockShape;
static constexpr bool kFastFDiv = Problem::kFastFDiv;
template <typename MeanDistributedTensor_>
CK_TILE_DEVICE static constexpr index_t GetReduceWarps()
......@@ -351,12 +354,23 @@ CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count(int row_
}
// Note: this function must be called after all the computation
template <typename VarDistributedTensor_>
template <typename VarDistributedTensor_, bool FastFdiv_ = false>
CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTensor_& var_tensor,
int count)
int count,
bool_constant<FastFdiv_> = {})
{
using DataType = typename VarDistributedTensor_::DataType;
tile_elementwise_inout([&count](auto& x) { x = x / type_convert<DataType>(count); },
var_tensor);
tile_elementwise_inout(
[&count](auto& x) {
if(FastFdiv_ && std::is_same_v<DataType, float>)
{
x = x * __builtin_amdgcn_rcpf(type_convert<DataType>(count));
}
else
{
x = x / type_convert<DataType>(count);
}
},
var_tensor);
}
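// Opt-in usage sketch (hypothetical var_tensor/count, not from this commit):
//   block_tile_welford_post_scale_var(var_tensor, count, bool_constant<true>{}); // fast rcp
//   block_tile_welford_post_scale_var(var_tensor, count);                        // exact fdiv (default)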
} // namespace ck_tile
......@@ -7,12 +7,13 @@
namespace ck_tile {
template <typename XDataType_, typename ComputeDataType_, typename BlockShape_>
template <typename XDataType_, typename ComputeDataType_, typename BlockShape_, bool kFastFDiv_>
struct BlockWelfordProblem
{
using XDataType = remove_cvref_t<XDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
using XDataType = remove_cvref_t<XDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
static constexpr bool kFastFDiv = kFastFDiv_;
};
} // namespace ck_tile
......@@ -7,25 +7,46 @@
namespace ck_tile {
template <typename T>
CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count)
template <typename T, bool kFastFDiv = false>
CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count, bool_constant<kFastFDiv> = {})
{
// TODO: check nan? maybe no
T delta = x - mean;
mean += delta / count;
if(kFastFDiv && std::is_same_v<T, float>)
{
mean += delta * __builtin_amdgcn_rcpf(count);
}
else
{
mean += delta / count;
}
T delta2 = x - mean;
var += delta * delta2;
}
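// Scalar sketch of how welford_update composes (hypothetical driver over float x[0..n)):
//   float mean = 0, var = 0;
//   for(int i = 0; i < n; ++i)
//       welford_update(mean, var, x[i], i + 1); // count includes the new sample
//   // 'var' now holds M2 = sum((x - mean)^2); dividing by the final count finishes the
//   // variance, which is what block_tile_welford_post_scale_var does tile-wide.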
template <typename T>
CK_TILE_DEVICE static void
welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
template <typename T, bool kFastFDiv = false>
CK_TILE_DEVICE static void welford_merge(T& mean_a,
T& var_a,
int& count_a,
T mean_b,
T var_b,
int count_b,
bool_constant<kFastFDiv> = {})
{
int count = count_a + count_b;
T count_ = type_convert<T>(count);
T count_a_ = type_convert<T>(count_a);
T count_b_ = type_convert<T>(count_b);
T count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
int count = count_a + count_b;
T count_ = type_convert<T>(count);
T count_a_ = type_convert<T>(count_a);
T count_b_ = type_convert<T>(count_b);
T count_b_over_count;
if(kFastFDiv && std::is_same_v<T, float>)
{
count_b_over_count =
count == 0 ? type_convert<T>(0) : count_b_ * __builtin_amdgcn_rcpf(count_);
}
else
{
count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
}
T delta = mean_b - mean_a;
mean_a += delta * count_b_over_count;
......
from datetime import datetime
import pathlib
from pathlib import Path
import subprocess
......@@ -8,8 +9,8 @@ NS = 'ck_tile'
OPS = 'ops'
OPS_COMMON = 'common' # common headers are duplicated into the other ops/* modules
HEADER_COMMON = """// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
HEADER_COMMON = f"""// SPDX-License-Identifier: MIT
// Copyright (c) 2018-{datetime.now().year}, Advanced Micro Devices, Inc. All rights reserved.\n
"""
# aa/bb/cc/file.hpp -> (aa, bb, cc, file.hpp)
......
......@@ -39,7 +39,25 @@ template <ck::index_t NDimSpatial,
ConvolutionBackwardWeightSpecialization ConvSpec,
BlockGemmPipelineScheduler Scheduler,
BlockGemmPipelineVersion PipelineVersion>
using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances = std::tuple<
using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances =
std::tuple<
// clang-format off
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge|
//#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| |
//#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | |
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1>
// clang-format on
>;
template <ck::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout,
ConvolutionBackwardWeightSpecialization ConvSpec,
BlockGemmPipelineScheduler Scheduler,
BlockGemmPipelineVersion PipelineVersion>
using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances = std::tuple<
// clang-format off
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge|
......@@ -64,7 +82,25 @@ template <ck::index_t NDimSpatial,
ConvolutionBackwardWeightSpecialization ConvSpec,
BlockGemmPipelineScheduler Scheduler,
BlockGemmPipelineVersion PipelineVersion>
using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances = std::tuple<
using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances =
std::tuple<
// clang-format off
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge|
//#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| |
//#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | |
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1>
// clang-format on
>;
template <ck::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout,
ConvolutionBackwardWeightSpecialization ConvSpec,
BlockGemmPipelineScheduler Scheduler,
BlockGemmPipelineVersion PipelineVersion>
using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances = std::tuple<
// clang-format off
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge|
......@@ -82,6 +118,24 @@ using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_bf16_instances = st
// clang-format on
>;
template <ck::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout,
ConvolutionBackwardWeightSpecialization ConvSpec,
BlockGemmPipelineScheduler Scheduler,
BlockGemmPipelineVersion PipelineVersion>
using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances =
std::tuple<
// clang-format off
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge|
//#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| |
//#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | |
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1, F16, F16, 1, 1>
// clang-format on
>;
// NGCHW requires a transpose; we use vector load/store parameters for it
template <ck::index_t NDimSpatial,
typename ALayout,
......@@ -122,6 +176,24 @@ using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances
// clang-format on
>;
template <ck::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout,
ConvolutionBackwardWeightSpecialization ConvSpec,
BlockGemmPipelineScheduler Scheduler,
BlockGemmPipelineVersion PipelineVersion>
using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances =
std::tuple<
// clang-format off
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge|
//#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| |
//#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | |
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 16, 16, 32, 8, 16, 16, 1, 1, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 1, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1>
// clang-format on
>;
template <ck::index_t NDimSpatial,
typename ALayout,
typename BLayout,
......
......@@ -131,6 +131,31 @@ using device_grouped_conv_fwd_xdl_f32_comp_instances = std::tuple<
// clang-format on
>;
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv_fwd_xdl_int8_comp_instances = std::tuple<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 256, 256, 32, 8, 8, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
// AGPR spills occur with the permuted LDS layout, so use padding for these two.
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
......
......@@ -53,8 +53,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances = std::tuple<
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// instances for small conv.K and conv.C
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>
#if 0 // Enable with dynamic op optimizations (currently this generates many virtual functions, causing long compilation times)
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
......@@ -68,6 +68,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_bf16_instances = std::tuple<
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, Tuple<>, BF16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>
#endif
// clang-format on
>;
......@@ -87,8 +88,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f16_instances = std::tuple<
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// instances for small conv.K and conv.C
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>
#if 0 // Enable with dynamic op optimizations (currently this generates many virtual functions, causing long compilation times)
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
......@@ -102,6 +103,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f16_instances = std::tuple<
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, Tuple<>, F16, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>
#endif
// clang-format on
>;
......@@ -121,8 +123,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f32_instances = std::tuple<
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>,
// instances for small conv.K and conv.C
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>
#if 0 // Enable with dynamic op optimizations (currently this generates many virtual functions, causing long compilation times)
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
......@@ -136,6 +138,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_f32_instances = std::tuple<
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, Tuple<>, F32, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>
#endif
// clang-format on
>;
......@@ -155,8 +158,8 @@ using device_grouped_conv_fwd_xdl_dynamic_op_int8_instances = std::tuple<
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// instances for small conv.K and conv.C
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>
#if 0 // Enable together with dynamic op optimizations (for now these instances generate a lot of virtual functions, causing long compilation times)
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
......@@ -170,6 +173,7 @@ using device_grouped_conv_fwd_xdl_dynamic_op_int8_instances = std::tuple<
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, Tuple<>, int8_t, PassThrough, PassThrough, DynamicUnaryOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>
#endif
// clang-format on
>;
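Each alias above is a plain std::tuple over fully specialized device operations; the instance library expands such a tuple into a runtime vector of polymorphic ops that a dispatcher can iterate and benchmark. Below is a minimal self-contained sketch of that expansion pattern; the helper name append_instances is hypothetical and only stands in for CK's actual registration helper.

// Minimal sketch of expanding an instance tuple into a runtime op list.
// append_instances is a hypothetical stand-in, not CK's actual API.
#include <memory>
#include <tuple>
#include <vector>

template <typename BaseOp, typename... Instances>
void append_instances(std::vector<std::unique_ptr<BaseOp>>& ops,
                      const std::tuple<Instances...>&)
{
    // Default-construct one op object per instance type in the tuple.
    (ops.push_back(std::make_unique<Instances>()), ...);
}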
......
......@@ -87,6 +87,25 @@ using device_grouped_conv_fwd_xdl_large_tensor_f32_instances = std::tuple<
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>
// clang-format on
>;
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv_fwd_xdl_large_tensor_int8_instances = std::tuple<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
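The large-tensor aliases are still parameterized on the tensor layouts and the conv specialization; a concrete instance list is obtained by pinning those parameters. A hypothetical usage sketch, assuming the NHWGC/GKYXC/NHWGK layout tags and the Default forward specialization used elsewhere in CK:

// Hypothetical instantiation sketch; the layout tag names and the Default
// specialization are assumptions based on CK's usual conventions.
using Int8LargeTensor2DInstances =
    device_grouped_conv_fwd_xdl_large_tensor_int8_instances<
        2,                                     // NDimSpatial
        ck::tensor_layout::convolution::NHWGC, // ALayout (input)
        ck::tensor_layout::convolution::GKYXC, // BLayout (weight)
        ck::Tuple<>,                           // DsLayout (no extra D tensors)
        ck::tensor_layout::convolution::NHWGK, // ELayout (output)
        ck::tensor_operation::device::ConvolutionForwardSpecialization::Default>;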
......
......@@ -154,6 +154,43 @@ using device_grouped_conv_fwd_xdl_f32_mem_instances = std::tuple<
// clang-format on
>;
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
ConvolutionForwardSpecialization ConvSpec,
BlockGemmPipelineScheduler BlkGemmPipeSched>
using device_grouped_conv_fwd_xdl_int8_mem_instances = std::tuple<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
// Memory friendly
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 256, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 256, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 128, 32, 64, 8, 8, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 128, 16, 64, 8, 8, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 64, 32, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 64, 16, 64, 8, 8, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 32, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 64, 16, 16, 128, 8, 8, 16, 16, 1, 1, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 64, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 16, 32, 64, 8, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 32, 64, 64, 8, 8, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 16, 128, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 128, 32, 128, 64, 8, 8, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 16, 256, 64, 8, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 256, 32, 256, 64, 8, 8, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
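Unlike the preceding tuples, the mem instances take an extra BlockGemmPipelineScheduler argument, while each entry pins its own BlockGemmPipelineVersion (v1 for the compute-oriented entries, v2 for the memory-friendly ones). A hypothetical instantiation sketch, assuming Intrawave as one value of that scheduler enum:

// Hypothetical instantiation sketch; Intrawave is an assumed value of the
// BlockGemmPipelineScheduler enum named in the template parameter list above.
using Int8MemIntrawave2DInstances =
    device_grouped_conv_fwd_xdl_int8_mem_instances<
        2,
        ck::tensor_layout::convolution::NHWGC,
        ck::tensor_layout::convolution::GKYXC,
        ck::Tuple<>,
        ck::tensor_layout::convolution::NHWGK,
        ck::tensor_operation::device::ConvolutionForwardSpecialization::Default,
        ck::BlockGemmPipelineScheduler::Intrawave>;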
......
......@@ -90,6 +90,25 @@ using device_grouped_conv_fwd_xdl_merged_groups_f32_instances = std::tuple<
// clang-format on
>;
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv_fwd_xdl_merged_groups_int8_instances = std::tuple<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// Instances with NumGroupsPerBatch > 1
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 16>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, DsLayout, int8_t, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 32>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
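The three merged-groups instances differ only in their final template argument (8, 16, and 32), assumed here to set how many groups a single workgroup processes together. A minimal dispatch sketch under that assumption (pick_merge_factor is a hypothetical helper, not CK API):

// Hypothetical helper: choose the largest merge factor that evenly divides
// the group count, falling back to the non-merged instances otherwise.
inline int pick_merge_factor(int groups)
{
    for(int factor : {32, 16, 8})
    {
        if(groups % factor == 0)
            return factor;
    }
    return 1; // no merged-groups instance applies
}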
......