Merge branch 'develop' into normalization/splitK

0d6da13c · rocking · GitHub · 27f8c64b · b076a02a · 0d6da13c
Unverified Commit 0d6da13c authored May 05, 2023 by rocking Committed by GitHub May 05, 2023
20 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -54,7 +54,8 @@ __global__ void
            const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
            const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__)) // body is compiled for the host pass and the listed gfx targets only
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -44,7 +44,8 @@ __global__ void
                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
                                    const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -57,7 +57,8 @@ __global__ void
            const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock,
            const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    // TODO ANT: separate into MMA + Epilogue

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -165,7 +165,8 @@ __global__ void
                                      const CElementwiseOperation c_element_op,
                                      const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -265,7 +266,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
    // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
    // when mfma if fixed, remove this section and update
    // FloatABAdjusted -> FloatAB throughout this file
-#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
+#if CK_WORKAROUND_DENORM_FIX
    using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
 #else
    using FloatABAdjusted = FloatAB;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
@@ -44,7 +44,8 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -43,7 +43,8 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
@@ -135,7 +136,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
    // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
    // when mfma if fixed, remove this section and update
    // FloatABAdjusted -> FloatAB throughout this file
-#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
+#if CK_WORKAROUND_DENORM_FIX
    using FloatABAdjusted = conditional_t<is_same_v<FloatAB, ck::half_t>, ck::bhalf_t, FloatAB>;
 #else
    using FloatABAdjusted = FloatAB;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -42,7 +42,8 @@ __global__ void
                                const CElementwiseOperation c_element_op,
                                const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
    constexpr index_t shared_block_size =
        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -15,63 +15,32 @@
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+
 namespace ck {

 template <typename GridwiseGemm,
-          typename FloatAB,
-          typename FloatC,
-          typename AGridDesc_B_K0_M_K1,
-          typename BGridDesc_B_K0_N_K1,
-          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename CBlockClusterAdaptor,
-          bool HasMainKBlockLoop>
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          typename Block2CTileMap>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_gemm_xdlops_v2r4r2(const FloatAB* __restrict__ p_a_grid,
-                                  const FloatAB* __restrict__ p_b_grid,
-                                  FloatC* __restrict__ p_c_grid,
-                                  const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc,
-                                  const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc,
-                                  const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                  const AElementwiseOperation a_element_op,
-                                  const BElementwiseOperation b_element_op,
-                                  const CElementwiseOperation c_element_op,
-                                  const CBlockClusterAdaptor c_block_cluster_adaptor)
+        kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg,
+                                             const Block2CTileMap b2c_map) // by value: a reference to a host object cannot be read on device
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
-    constexpr index_t shared_block_size =
-        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-
-    __shared__ FloatAB p_shared_block[shared_block_size];
-
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
-                                                  p_b_grid,
-                                                  p_c_grid,
-                                                  static_cast<void*>(p_shared_block),
-                                                  a_b_k0_m_k1_grid_desc,
-                                                  b_b_k0_n_k1_grid_desc,
-                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  a_element_op,
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  c_block_cluster_adaptor);
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
+    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
+
+    __shared__ uint8_t p_shared[shared_size];
+
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
+        karg, static_cast<void*>(p_shared), b2c_map);
 #else
-    ignore = p_a_grid;
-    ignore = p_b_grid;
-    ignore = p_c_grid;
-    ignore = a_b_k0_m_k1_grid_desc;
-    ignore = b_b_k0_n_k1_grid_desc;
-    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
-    ignore = a_element_op;
-    ignore = b_element_op;
-    ignore = c_element_op;
-    ignore = c_block_cluster_adaptor;
+    ignore = karg;
+    ignore = b2c_map;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

@@ -79,13 +48,13 @@ template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_B_K0_M_K1,
-          typename BGridDesc_B_K0_N_K1,
-          typename CMNGridDesc,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
+          tensor_operation::device::GemmSpecialization GemmSpec,
          index_t MPerBlock,
          index_t NPerBlock,
          index_t K0PerBlock,
@@ -126,10 +95,238 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
    static constexpr auto I7 = Number<7>{};

    // K1 should be Number<...>
-    static constexpr auto K1 = Number<K1Value>{};
+    static constexpr auto K1  = Number<K1Value>{};
+    static constexpr auto M01 = 1;
+    static constexpr auto N01 = 1;

    using ThisThreadBlock = ThisThreadBlock<BlockSize>;

+    struct Argument : public ck::tensor_operation::device::BaseArgument // argument pack the kernel receives by value
+    {
+        const FloatAB* p_a_grid; // A operand; Run() wraps it as a Global-address-space buffer
+        const FloatAB* p_b_grid; // B operand
+        FloatC* p_c_grid;        // C operand (non-const; wrapped as a Global buffer in Run())
+        index_t M;               // A is described as (M, K), B as (K, N), C as (M, N)
+        index_t N;
+        index_t K;
+        index_t StrideA; // stride of the non-unit-stride dimension of A (which one depends on ALayout)
+        index_t StrideB; // same for B / BLayout
+        index_t StrideC; // same for C / CLayout
+        index_t MPadded; // passed as MPad: M is right-padded by MPadded - M when GemmSpec pads M
+        index_t NPadded; // passed as NPad: N is right-padded by NPadded - N when GemmSpec pads N
+        index_t KPadded; // passed as KPad: K is always right-padded by KPadded - K in the A/B descriptors
+        index_t K0;      // middle length of the K -> (k_batch, K0, K1) unmerge
+        index_t k_batch; // number of K splits; also the third component of CalculateGridSize()
+
+        Argument(const FloatAB* p_a_grid_, // plain member-wise initialisation; nothing is validated or derived here
+                 const FloatAB* p_b_grid_,
+                 FloatC* p_c_grid_,
+                 index_t M_,
+                 index_t N_,
+                 index_t K_,
+                 index_t StrideA_,
+                 index_t StrideB_,
+                 index_t StrideC_,
+                 index_t MPadded_,
+                 index_t NPadded_,
+                 index_t KPadded_,
+                 index_t K0_,
+                 index_t k_batch_)
+            : p_a_grid(p_a_grid_),
+              p_b_grid(p_b_grid_),
+              p_c_grid(p_c_grid_),
+              M(M_),
+              N(N_),
+              K(K_),
+              StrideA(StrideA_),
+              StrideB(StrideB_),
+              StrideC(StrideC_),
+              MPadded(MPadded_),
+              NPadded(NPadded_),
+              KPadded(KPadded_),
+              K0(K0_),
+              k_batch(k_batch_)
+        {
+        }
+
+        void Print() const // writes every scalar field (not the pointers) to std::cout on one line
+        {
+            std::cout << "arg {"
+                      << "M:" << M << ", "
+                      << "N:" << N << ", "
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", "
+                      << "SB:" << StrideB << ", "
+                      << "SC:" << StrideC << ", "
+                      << "MP:" << MPadded << ", "
+                      << "NP:" << NPadded << ", "
+                      << "KP:" << KPadded << ", "
+                      << "K0:" << K0 << ", "
+                      << "KB:" << k_batch << "}" << std::endl; // std::endl also flushes the stream
+        }
+    };
+
+    __host__ __device__ static auto CalculateGridSize(const Argument& karg) // returns (N tiles, M tiles, k_batch)
+    {
+        return std::make_tuple(math::integer_divide_ceil(karg.N, NPerBlock), // tiles needed to cover N
+                               math::integer_divide_ceil(karg.M, MPerBlock), // tiles needed to cover M
+                               karg.k_batch);                                // forwarded unchanged
+    }
+
+    // Rounds M up to the next multiple of MPerBlock; prefer this to be called on host
+    __host__ __device__ static auto CalculateMPadded(index_t M)
+    {
+        return (M + MPerBlock - 1) / MPerBlock * MPerBlock; // integer ceil-divide, then scale back
+    }
+
+    __host__ __device__ static auto CalculateNPadded(index_t N) // rounds N up to the next multiple of NPerBlock
+    {
+        return (N + NPerBlock - 1) / NPerBlock * NPerBlock; // integer ceil-divide, then scale back
+    }
+
+    __host__ __device__ static auto CalculateK0(index_t K, index_t K_Batch = 1) // K0 length belonging to one K batch
+    {
+        // K is split as k_batch * k0 * k0_per_block * k1; K_t below is that product without the k0 factor
+        auto K_t = K_Batch * K0PerBlock * K1;
+        return (K + K_t - 1) / K_t * K0PerBlock; // ceil(K / K_t) steps of K0PerBlock each
+    }
+
+    __host__ __device__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1) // K rounded up to K_Batch * K0 * K1
+    {
+        auto K0 = CalculateK0(K, K_Batch); // already a multiple of K0PerBlock
+        return K_Batch * K0 * K1;          // total K extent covered by all batches
+    }
+
+    __host__ __device__ static auto MakeAGridDescriptor_KBatch_K0_M_K1(index_t M, // builds A's descriptor with dims (KBatch, K0, M, K1)
+                                                                       index_t MPad,
+                                                                       index_t K,
+                                                                       index_t StrideA,
+                                                                       index_t KBatch,
+                                                                       index_t K0,
+                                                                       index_t KPad)
+    {
+        const auto a_grid_desc_m_k = [&]() { // (M, K) view of A; ALayout selects which dimension has unit stride
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
+            }
+        }(); // no branch exists for any other ALayout
+
+        const auto a_grid_desc_m_kpad = transform_tensor_descriptor( // K is right-padded to KPad regardless of GemmSpec
+            a_grid_desc_m_k,
+            make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding || // every specialization that pads M
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            // const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)), // padded K -> (KBatch, K0, K1)
+                           make_right_pad_transform(M, MPad - M)),             // M -> MPad
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); // output order: KBatch, K0, M, K1
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)), // padded K -> (KBatch, K0, K1)
+                           make_pass_through_transform(M)),                    // M left as is
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); // output order: KBatch, K0, M, K1
+        }
+    }
+
+    __host__ __device__ static auto MakeBGridDescriptor_KBatch_K0_N_K1(index_t K, // builds B's descriptor with dims (KBatch, K0, N, K1)
+                                                                       index_t NPad, // NOTE(review): NPad precedes N here, unlike (M, MPad) in the A builder
+                                                                       index_t N,
+                                                                       index_t StrideB,
+                                                                       index_t KBatch,
+                                                                       index_t K0,
+                                                                       index_t KPad)
+    {
+        const auto b_grid_desc_k_n = [&]() { // (K, N) view of B; BLayout selects which dimension has unit stride
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
+            }
+        }(); // no branch exists for any other BLayout
+
+        const auto b_grid_desc_kpad_n = transform_tensor_descriptor( // K is right-padded to KPad regardless of GemmSpec
+            b_grid_desc_k_n,
+            make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding || // every specialization that pads N
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                     GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            // const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)), // padded K -> (KBatch, K0, K1)
+                           make_right_pad_transform(N, NPad - N)),             // N -> NPad
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); // output order: KBatch, K0, N, K1
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1)), // padded K -> (KBatch, K0, K1)
+                           make_pass_through_transform(N)),                    // N left as is
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); // output order: KBatch, K0, N, K1
+        }
+    }
+
+    __host__ __device__ static auto // builds the (M, N) descriptor of C, right-padded to (MPad, NPad) for MNPadding
+    MakeCGridDescriptor_M_N(index_t M, index_t N, index_t MPad, index_t NPad, index_t StrideC)
+    {
+        const auto c_grid_desc_m_n = [&]() { // CLayout selects which dimension has unit stride
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
+            }
+        }(); // no branch exists for any other CLayout
+
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding) // NOTE(review): A/B also pad for M/N/MK/NK/MNK specializations, C only here - confirm intended
+        {
+            return transform_tensor_descriptor(c_grid_desc_m_n,
+                                               make_tuple(make_right_pad_transform(M, MPad - M),
+                                                          make_right_pad_transform(N, NPad - N)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor( // pass-through on both dims, so MPad/NPad are unused on this path
+                c_grid_desc_m_n,
+                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
+
    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
    {
        constexpr auto max_lds_align = K1;
@@ -178,45 +375,68 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                         c_block_size * sizeof(FloatC));
    }

-    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
-    template <typename Block2CTileMap>
-    __host__ __device__ static constexpr bool
-    CheckValidity(const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
-                  const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
-                  const CMNGridDesc& c_m_n_grid_desc,
-                  const Block2CTileMap& block_2_ctile_map)
+    __host__ __device__ static constexpr bool CheckValidity(const Argument& karg) // true when karg's M/N/K satisfy the tile and vector-width divisibility rules below
    {
-        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
-                      "wrong! K1 need to be known at compile-time");
-
-        static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) &&
-                          (NPerBlock % (NRepeat * NPerXDL)) == 0,
-                      "Invalid tuning param!");
-
-        const auto M      = a_b_k0_m_k1_grid_desc.GetLength(I2);
-        const auto N      = b_b_k0_n_k1_grid_desc.GetLength(I2);
-        const auto K0     = a_b_k0_m_k1_grid_desc.GetLength(I1);
-        const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0);
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding || // M is not padded by GemmSpec ...
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(karg.M % MPerBlock == 0)) // ... so M itself must be a whole number of tiles
+                return false;
+        }
+        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding || // N is not padded by GemmSpec ...
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+        {
+            if(!(karg.N % NPerBlock == 0)) // ... so N itself must be a whole number of tiles
+                return false;
+        }

-        if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) &&
-             K0 == b_b_k0_n_k1_grid_desc.GetLength(I1) &&
-             K1 == a_b_k0_m_k1_grid_desc.GetLength(I3) &&
-             K1 == b_b_k0_n_k1_grid_desc.GetLength(I3) &&
-             KBatch == b_b_k0_n_k1_grid_desc.GetLength(I0)))
-            return false;
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value) // A: the unit-stride dimension (K if row-major, else M) must divide by the source vector width
+        {
+            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
+        else
+        {
+            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
+                return false;
+        }

-        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0))
-            return false;
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value) // B: same rule (N if row-major, else K)
+        {
+            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
+                return false;
+        }
+        else
+        {
+            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
+                return false;
+        }

-        if(!block_2_ctile_map.CheckValidity(c_m_n_grid_desc))
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value) // C: same rule against the C transfer vector width (N if row-major, else M)
        {
-            return false;
+            if(karg.N % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+                return false;
+        }
+        else
+        {
+            if(karg.M % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
+                return false;
        }

-        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
        return true;
    }

+    __host__ __device__ static auto GetKPad(index_t K, index_t KBatch)
+    {
+        // K rounded up to whole (KBatch * K0PerBlock * K1) chunks; same value as CalculateKPadded(), so delegate to it.
+        const index_t KPad = CalculateKPadded(K, KBatch);
+        return KPad;
+    }
+
    __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0)
    {
        const bool has_main_k0_block_loop = K0 > K0PerBlock;
@@ -224,8 +444,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        return has_main_k0_block_loop;
    }

+    template <typename CGridDesc>
    __host__ __device__ static constexpr auto
-    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CMNGridDesc& c_m_n_grid_desc)
+    MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc& c_m_n_grid_desc)
    {
        const auto M = c_m_n_grid_desc.GetLength(I0);
        const auto N = c_m_n_grid_desc.GetLength(I1);
@@ -242,10 +463,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
    }

    // return block_id to C matrix tile idx (m0, n0) mapping
+    template <typename CGridDesc>
    __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
+        const CGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
    {
-        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc>(
            c_m_n_grid_desc, 8, KBatch);
    }

@@ -262,24 +484,38 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                       Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
    }

-    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{}));
-    using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1));
+    // Default block_id -> C tile index map; Run() reads the mapped index as [k_batch, m, n]
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap()
+    {
+        return BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>(); // default-constructed, no runtime arguments
+    }
+
+    using CGridDesc_M_N         = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(1, 1, 1, 1, 1))>;
+    using DefaultBlock2CTileMap = remove_cvref_t<decltype(MakeDefaultBlock2CTileMap())>;

-    template <bool HasMainKBlockLoop>
-    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
-                               const FloatAB* __restrict__ p_b_grid,
-                               FloatC* __restrict__ p_c_grid,
+    template <bool HasMainKBlockLoop,
+              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+              typename Block2CTileMap>
+    __device__ static void Run(const Argument& karg,
                               void* __restrict__ p_shared_block,
-                               const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc,
-                               const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc,
-                               const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
-                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
-                               const AElementwiseOperation& a_element_op,
-                               const BElementwiseOperation& b_element_op,
-                               const CElementwiseOperation& c_element_op,
-                               const CBlockClusterAdaptor& c_block_cluster_adaptor)
+                               const Block2CTileMap& block_2_ctile_map)
    {
+        const FloatAB* p_a_grid          = karg.p_a_grid;
+        const FloatAB* p_b_grid          = karg.p_b_grid;
+        FloatC* p_c_grid                 = karg.p_c_grid;
+        const auto a_b_k0_m_k1_grid_desc = MakeAGridDescriptor_KBatch_K0_M_K1(
+            karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0, karg.KPadded);
+        const auto b_b_k0_n_k1_grid_desc = MakeBGridDescriptor_KBatch_K0_N_K1(
+            karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0, karg.KPadded);
+        const auto c_grid_desc_m_n =
+            MakeCGridDescriptor_M_N(karg.M, karg.N, karg.MPadded, karg.NPadded, karg.StrideC);
+
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
+        const AElementwiseOperation a_element_op = AElementwiseOperation{};
+        const BElementwiseOperation b_element_op = BElementwiseOperation{};
+        const CElementwiseOperation c_element_op = CElementwiseOperation{};
+
        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -287,28 +523,28 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());

-        const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
-
-        // divide block work by [M, N]
+        // divide block work by [KBatch, M, N]
        const auto block_work_idx =
-            c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));

-        if(!c_block_cluster_adaptor.ValidCTileIndex(
-               make_tuple(block_work_idx[I1], block_work_idx[I2]),
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
        {
            return;
        }

-        const index_t k_batch_id = block_work_idx[I0];
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I2]);
+        const index_t k_batch_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);

        // HACK: this force m/n_block_data_idx_on_grid into SGPR
        const index_t m_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock);
+            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);

        const index_t n_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock);
+            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);

        // lds max alignment
        constexpr auto max_lds_align = K1;
@@ -444,7 +680,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
        //       register
        // sanity check
-
+#if 1
        auto blockwise_gemm =
            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                FloatAB,
@@ -456,6 +692,20 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                                                                MRepeat,
                                                                NRepeat,
                                                                K1>{};
+#else
+        auto blockwise_gemm = BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<
+            BlockSize,
+            FloatAB,
+            FloatAcc,
+            decltype(a_k0_m_k1_block_desc),
+            decltype(b_k0_n_k1_block_desc),
+            MPerXDL,
+            NPerXDL,
+            MRepeat,
+            NRepeat,
+            K1>{};
+
+#endif

        auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();

@@ -474,6 +724,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());

+#if 0
        // preload data into LDS
        {
            a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf);
@@ -519,6 +770,31 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2

            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }
+#else
+        // gridwise GEMM pipeline
+        const auto gridwise_gemm_pipeline =
+            GridwiseGemmPipeline_Selector<PipelineVersion::v2, 1, LoopScheduler::Default>();
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_b_k0_m_k1_grid_desc.GetLength(I1) * a_b_k0_m_k1_grid_desc.GetLength(I3)) /
+            (K0PerBlock * K1));
+
+        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_b_k0_m_k1_grid_desc,
+                                                               a_b_k0_m_k1_block_desc,
+                                                               a_blockwise_copy,
+                                                               a_grid_buf,
+                                                               a_block_buf,
+                                                               a_block_slice_copy_step,
+                                                               b_b_k0_n_k1_grid_desc,
+                                                               b_b_k0_n_k1_block_desc,
+                                                               b_blockwise_copy,
+                                                               b_grid_buf,
+                                                               b_block_buf,
+                                                               b_block_slice_copy_step,
+                                                               blockwise_gemm,
+                                                               c_thread_buf,
+                                                               num_k_block_main_loop);
+#endif

        // output: register to global memory
        {
@@ -647,7 +923,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
                {c_block_desc_mblock_mperblock_nblock_nperblock,
                 make_multi_index(0, 0, 0, 0),
                 c_grid_desc_mblock_mperblock_nblock_nperblock,
-                 make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0),
+                 make_multi_index(block_m_id, 0, block_n_id, 0),
                 c_element_op};

            constexpr auto mxdlperwave_forward_step =
@@ -716,6 +992,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
            });
        }
    }
+
+    template <typename Layout> // Maps a layout type to a short string tag; this primary template yields "" for any layout without a specialization.
+    struct LStr
+    {
+        static std::string Get() { return ""; }
+    };
+
+    template <> // RowMajor layout -> "R"
+    struct LStr<ck::tensor_layout::gemm::RowMajor>
+    {
+        static std::string Get() { return "R"; }
+    };
+
+    template <> // ColumnMajor layout -> "C"
+    struct LStr<ck::tensor_layout::gemm::ColumnMajor>
+    {
+        static std::string Get() { return "C"; }
+    };
+
+    static std::string GetTypeString() // Returns a human-readable identifier assembled from this kernel's compile-time configuration.
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "GemmXdlSplitKCShuffle_"
+            << getGemmSpecializationString(GemmSpec) << "_"
+            << std::string(ALayout::name)[0] // first character of each layout's name, in A, B, C order
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << "_"
+            << "B" << BlockSize << "_"
+            << "Vec" << ABlockTransferSrcScalarPerVector << "x" // scalar-per-vector settings: A source, B source, then C
+            << BBlockTransferSrcScalarPerVector << "x"
+            << CBlockTransferScalarPerVector_NWaveNPerXDL << "_"
+            << MPerBlock << "x" // tile parameters: MPerBlock x NPerBlock x K0PerBlock x K1
+            << NPerBlock << "x"
+            << K0PerBlock << "x"
+            << K1 ;
+        // clang-format on
+
+        return str.str();
+    }
 };

 } // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -46,7 +46,8 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainK0BlockLoop>(

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -49,7 +49,8 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainKBlockLoop>(

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -53,7 +53,8 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainKBlockLoop>(

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -6,6 +6,7 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor/static_tensor.hpp"

@@ -207,15 +208,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
            auto src_vector_container = src_vector_type{
                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};

-            // apply SrcElementwiseOperation on src_vector_container
-            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                SrcData src_v;
-
-                src_element_op_(src_v, src_vector_container.template AsType<SrcData>()[i]);
-
-                src_vector_container.template AsType<SrcData>()(i) = src_v;
-            });
-
            // copy data from src_vector_container into src_thread_scratch_
            src_thread_scratch_tuple_(thread_scratch_id)
                .template SetAsType<src_vector_t>(
@@ -318,7 +310,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                constexpr auto data_idx_seq = generate_sequence_v2(
                    [&](auto i) { return Number<data_idx[i]>{}; }, Number<nDim>{});

-                // TODO type_convert is not used yet!!!!!
                using src_vector_t = vector_type_maker_t<SrcData, SrcScalarPerVector>;
                using dst_vector_t = vector_type_maker_t<DstData, DstScalarPerVector>;

@@ -342,19 +333,17 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                    Number<num_dst_vector>{});

                // do data transpose
-                // TODO type_convert is not used yet!!!!!
                transpose_vectors<SrcData, DstScalarPerVector, SrcScalarPerVector>{}(
                    src_vector_refs, dst_vector_refs);
            });
        }
-        else
-        {
-            static_ford<SliceLengths>{}([&](auto idx) {
-                // convert from SrcData to DstData here
-                dst_thread_scratch_(idx) =
-                    type_convert<DstData>(src_thread_scratch_tuple_[thread_scratch_id][idx]);
-            });
-        }
+
+        static_ford<SliceLengths>{}([&](auto idx) {
+            // apply the src elementwise op and convert to DstData under the hood if needed
+            DstData dst_v;
+            src_element_op_(dst_v, src_thread_scratch_tuple_[thread_scratch_id][idx]);
+            dst_thread_scratch_(idx) = dst_v;
+        });
 #endif
    }


--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -27,6 +27,8 @@ enum struct MfmaInstr
    mfma_f32_16x16x8bf16,
    mfma_i32_32x32x8i8,
    mfma_i32_16x16x16i8,
+    mfma_i32_32x32x16i8,
+    mfma_i32_16x16x32i8,
    mfma_f64_16x16x4f64
 };

@@ -386,6 +388,50 @@ struct mfma_type<MfmaInstr::mfma_i32_16x16x16i8>
    }
 };

+template <> // Compile-time traits for MfmaInstr::mfma_i32_32x32x16i8, plus a run() that forwards to the matching intrinsic wrapper.
+struct mfma_type<MfmaInstr::mfma_i32_32x32x16i8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 16; // equals group_size * num_groups_per_blk
+    static constexpr index_t num_threads_per_blk = 32;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 32;
+    static constexpr index_t n_per_blk           = 32;
+    static constexpr index_t k_per_blk           = 8; // num_input_blks * k_per_blk = 16, presumably the K of the instruction name - confirm
+    static constexpr bool is_k_reduction         = true;
+
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // forwards a, b and the accumulator reg_c unchanged
+    {
+        intrin_mfma_i32_32x32x16i8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+
+template <> // Compile-time traits for MfmaInstr::mfma_i32_16x16x32i8, plus a run() that forwards to the matching intrinsic wrapper.
+struct mfma_type<MfmaInstr::mfma_i32_16x16x32i8>
+{
+    static constexpr index_t group_size          = 4;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 4; // equals group_size * num_groups_per_blk
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8; // num_input_blks * k_per_blk = 32, presumably the K of the instruction name - confirm
+    static constexpr bool is_k_reduction         = true;
+
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const // forwards a, b and the accumulator reg_c unchanged
+    {
+        intrin_mfma_i32_16x16x32i8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};
+
 template <>
 struct mfma_type<MfmaInstr::mfma_f64_16x16x4f64>
 {
@@ -524,17 +570,29 @@ struct MfmaSelector
 #endif
    }

+#if defined(CK_USE_AMD_MFMA_GFX940) // with this macro defined, int8 selects the 32x32x16 / 16x16x32 instructions
+    template <>
+    static constexpr auto GetMfma<int8_t, 32, 32>()
+    {
+        return MfmaInstr::mfma_i32_32x32x16i8;
+    }
+    template <>
+    static constexpr auto GetMfma<int8_t, 16, 16>()
+    {
+        return MfmaInstr::mfma_i32_16x16x32i8;
+    }
+#else // otherwise int8 keeps the 32x32x8 / 16x16x16 instructions
    template <>
    static constexpr auto GetMfma<int8_t, 32, 32>()
    {
        return MfmaInstr::mfma_i32_32x32x8i8;
    }
-
    template <>
    static constexpr auto GetMfma<int8_t, 16, 16>()
    {
        return MfmaInstr::mfma_i32_16x16x16i8;
    }
+#endif // CK_USE_AMD_MFMA_GFX940

    static constexpr auto selected_mfma = mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops>()>{};


--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -297,6 +297,44 @@ struct intrin_mfma_i32_16x16x16i8<16, 16>
    }
 };

+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_i32_32x32x16i8;
+
+template <> // Wrapper around __builtin_amdgcn_mfma_i32_32x32x16_i8 for the <32, 32> case.
+struct intrin_mfma_i32_32x32x16i8<32, 32>
+{
+    template <class FloatC>
+    __device__ static void Run(const int8x8_t& reg_a, const int8x8_t& reg_b, FloatC& reg_c) // updates reg_c in place
+    {
+        reg_c.template AsType<int32x16_t>()(Number<0>{}) = // element 0 of reg_c, viewed as int32x16_t, receives the result
+            __builtin_amdgcn_mfma_i32_32x32x16_i8(bit_cast<int64_t>(reg_a), // each int8x8_t operand is reinterpreted as one 64-bit integer
+                                                  bit_cast<int64_t>(reg_b),
+                                                  reg_c.template AsType<int32x16_t>()[Number<0>{}], // current value of the same element is the accumulator input
+                                                  0, // NOTE(review): the three trailing zeros are presumably the cbsz/abid/blgp modifiers - confirm against the builtin's documentation
+                                                  0,
+                                                  0);
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_i32_16x16x32i8;
+
+template <> // Wrapper around __builtin_amdgcn_mfma_i32_16x16x32_i8 for the <16, 16> case.
+struct intrin_mfma_i32_16x16x32i8<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const int8x8_t& reg_a, const int8x8_t& reg_b, FloatC& reg_c) // updates reg_c in place
+    {
+        reg_c.template AsType<int32x4_t>()(Number<0>{}) = // element 0 of reg_c, viewed as int32x4_t, receives the result
+            __builtin_amdgcn_mfma_i32_16x16x32_i8(bit_cast<int64_t>(reg_a), // builtin is spelled with "_i8" (unlike the older ...x16i8 / ...x8i8 builtins); each int8x8_t operand is reinterpreted as one 64-bit integer
+                                                  bit_cast<int64_t>(reg_b),
+                                                  reg_c.template AsType<int32x4_t>()[Number<0>{}], // current value of the same element is the accumulator input
+                                                  0, // NOTE(review): the three trailing zeros are presumably the cbsz/abid/blgp modifiers - confirm against the builtin's documentation
+                                                  0,
+                                                  0);
+    }
+};
+
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f64_16x16x4f64;

@@ -306,7 +344,7 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
    template <class FloatC>
    __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c)
    {
-#ifdef __gfx90a__
+#if defined(__gfx90a__) || defined(__gfx940__)
        reg_c.template AsType<double4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
            reg_a, reg_b, reg_c.template AsType<double4_t>()[Number<0>{}], 0, 0, 0);
 #else

--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -898,6 +898,8 @@ struct vector_type<T, 256>
    }
 };

+using int64_t = long;
+
 // fp64
 using double2_t = typename vector_type<double, 2>::type;
 using double4_t = typename vector_type<double, 4>::type;
@@ -974,37 +976,6 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float
        uint32_t int32;
    } u = {x};

-    // When the exponent bits are not all 1s, then the value is zero, normal,
-    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
-    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
-    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
-    // least significant bits of the float mantissa are greater than 0x8000,
-    // or if they are equal to 0x8000 and the least significant bit of the
-    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
-    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
-    // has the value 0x7f, then incrementing it causes it to become 0x00 and
-    // the exponent is incremented by one, which is the next higher FP value
-    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
-    // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
-    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
-    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
-    // incrementing it causes it to become an exponent of 0xFF and a mantissa
-    // of 0x00, which is Inf, the next higher value to the unrounded value.
-    bool flag0 = ~u.int32 & 0x7f800000;
-
-    // When all of the exponent bits are 1, the value is Inf or NaN.
-    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
-    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
-    // bit being 1. Signaling NaN is indicated by the most significant
-    // mantissa bit being 0 but some other bit(s) being 1. If any of the
-    // lower 16 bits of the mantissa are 1, we set the least significant bit
-    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
-    // the bfloat16's mantissa bits are all 0.
-    bool flag1 = !flag0 && (u.int32 & 0xffff);
-
-    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
-    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN
-
    return uint16_t(u.int32 >> 16);
 }

@@ -1062,6 +1033,63 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
    return type_convert<bhalf_t>(x_fp32);
 }

+// Declare a template function for bf16 conversion using RTN
+template <typename Y, typename X>
+__host__ __device__ constexpr Y bf16_convert_rtn(X x);
+
+// Convert fp32 to bf16 with round-to-nearest-even (RTN); use it where the rounded result is needed instead of simply keeping the upper 16 bits
+template <>
+inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, float>(float x)
+{
+    union
+    {
+        float fp32;
+        uint32_t int32;
+    } u = {x};
+
+    // When the exponent bits are not all 1s, then the value is zero, normal,
+    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+    // least significant bits of the float mantissa are greater than 0x8000,
+    // or if they are equal to 0x8000 and the least significant bit of the
+    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+    // has the value 0x7f, then incrementing it causes it to become 0x00 and
+    // the exponent is incremented by one, which is the next higher FP value
+    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+    // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
+    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+    // incrementing it causes it to become an exponent of 0xFF and a mantissa
+    // of 0x00, which is Inf, the next higher value to the unrounded value.
+    bool flag0 = ~u.int32 & 0x7f800000;
+
+    // When all of the exponent bits are 1, the value is Inf or NaN.
+    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+    // bit being 1. Signaling NaN is indicated by the most significant
+    // mantissa bit being 0 but some other bit(s) being 1. If any of the
+    // lower 16 bits of the mantissa are 1, we set the least significant bit
+    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+    // the bfloat16's mantissa bits are all 0.
+    bool flag1 = !flag0 && (u.int32 & 0xffff);
+
+    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
+    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN
+
+    return uint16_t(u.int32 >> 16); // the upper 16 bits of the adjusted word are the bf16 bit pattern
+}
+
+// Convert fp16 to bf16 via fp32 with RTN if higher precision is needed
+template <>
+inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, half_t>(half_t x)
+{
+    float x_fp32 = static_cast<float>(x); // widen to fp32 first, then reuse the fp32 -> bf16 RTN conversion
+
+    return bf16_convert_rtn<bhalf_t>(x_fp32);
+}
+
 template <typename T>
 struct NumericLimits
 {

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -6,6 +6,7 @@
 #include <iostream>
 #include <sstream>

+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
 #include "ck/library/utility/host_tensor.hpp"

@@ -66,8 +67,26 @@ struct ReferenceGemm : public device::BaseOperator
                    ADataType v_a;
                    BDataType v_b;

-                    arg.a_element_op_(v_a, arg.a_m_k_(m, k));
-                    arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                    // use PassThrough instead of ConvertBF16RTN for reference calculation
+                    if constexpr(is_same_v<AElementwiseOperation,
+                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                    {
+                        ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k));
+                    }
+                    else
+                    {
+                        arg.a_element_op_(v_a, arg.a_m_k_(m, k));
+                    }
+                    // same for B matrix
+                    if constexpr(is_same_v<BElementwiseOperation,
+                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                    {
+                        ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n));
+                    }
+                    else
+                    {
+                        arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                    }

                    v_acc +=
                        ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -117,20 +117,6 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);

-void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
-                                                              GKYXC,
-                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
-                                                              Empty_Tuple,
-                                                              int8_t,
-                                                              PassThrough,
-                                                              PassThrough,
-                                                              PassThrough>>>& instances);
-
 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              GNHWC,
@@ -159,20 +145,21 @@ void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);

-void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances(
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              Empty_Tuple,
-                                                              GNHWK,
-                                                              int8_t,
-                                                              int8_t,
+                                                              NHWGK,
+                                                              BF16,
+                                                              BF16,
                                                              Empty_Tuple,
-                                                              int8_t,
+                                                              BF16,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances);
-// grouped conv2d forward, NHWGC/GKYXC/NHWGK
+
 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
                                                              NHWGC,
@@ -187,6 +174,20 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                              PassThrough,
                                                              PassThrough>>>& instances);

+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances( // declaration only; the factory in this header calls it with op_ptrs for the 2D NHWGC/GKYXC/NHWGK float case
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
+                                                              NHWGC,
+                                                              GKYXC,
+                                                              Empty_Tuple,
+                                                              NHWGK,
+                                                              F32,
+                                                              F32,
+                                                              Empty_Tuple,
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              PassThrough>>>& instances);
+
 // grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK
 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
@@ -385,12 +386,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            {
                add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(op_ptrs);
            }
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                              is_same_v<OutDataType, int8_t>)
-            {
-                add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances(op_ptrs);
-                add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances(op_ptrs);
-            }
        }
        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                          is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK>)
@@ -398,7 +393,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
-                // no instance
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
            }
            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                              is_same_v<OutDataType, half_t>)
@@ -409,12 +404,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                              is_same_v<WeiDataType, ck::bhalf_t> &&
                              is_same_v<OutDataType, ck::bhalf_t>)
            {
-                // no instance
-            }
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                              is_same_v<OutDataType, int8_t>)
-            {
-                // no instance
+                add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(op_ptrs);
            }
        }
        else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, GNDHWC> &&

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -68,6 +68,58 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
                                                  PassThrough,
                                                  PassThrough>>>& instances);

+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( // declaration only; called by the factory in this header for the Row/Col/Row f16 case
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( // declaration only; called by the factory in this header for the Row/Row/Row f16 case
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances( // declaration only; "irregular" counterpart of the Row/Col/Row case, called alongside it by the factory
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Col,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances( // declaration only; "irregular" counterpart of the Row/Row/Row case, called alongside it by the factory
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
 template <typename ALayout,
          typename BLayout,
          typename ELayout,
@@ -109,11 +161,17 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                         is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                              is_same_v<ELayout, Row>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp
@@ -17,14 +17,14 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_F32_Tuple,
@@ -36,10 +36,10 @@ void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(

 void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_F32_Tuple,
@@ -52,10 +52,10 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
 void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_F32_Tuple,
@@ -68,10 +68,10 @@ void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
 void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_F32_Tuple,
@@ -83,10 +83,10 @@ void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(

 void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_F32_Tuple,
@@ -99,10 +99,10 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
 void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_F32_Tuple,
@@ -154,9 +154,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)
@@ -220,9 +220,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp
@@ -17,14 +17,14 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// grouped conv2d forward, GNHWC/GKYXC/GNHWK
+// grouped conv2d forward, NHWGC/GKYXC/NHWGK
 void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_Tuple,
@@ -36,10 +36,10 @@ void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(

 void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_Tuple,
@@ -51,10 +51,10 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(

 void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_Tuple,
@@ -67,10 +67,10 @@ void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
 void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
    std::vector<
        std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                      GNHWC,
+                                                      NHWGC,
                                                      GKYXC,
                                                      GK_Tuple,
-                                                      GNHWK,
+                                                      NHWGK,
                                                      int8_t,
                                                      int8_t,
                                                      I32_Tuple,
@@ -82,10 +82,10 @@ void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(

 void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_Tuple,
@@ -97,10 +97,10 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(

 void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
-                                                              GNHWC,
+                                                              NHWGC,
                                                              GKYXC,
                                                              GK_Tuple,
-                                                              GNHWK,
+                                                              NHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              I32_Tuple,
@@ -152,9 +152,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)
@@ -218,9 +218,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

-        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
                     is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
-                     is_same_v<OutLayout, GNHWK>)
+                     is_same_v<OutLayout, NHWGK>)
        {
            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                         is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)