Commit 31d2d52a authored by wangshaojie6

merge develop

parents 5718bc14 7c788e10
@@ -6,9 +6,10 @@ find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
 find_package(hip REQUIRED PATHS /opt/rocm)
 message(STATUS "Build with HIP ${hip_VERSION}")
-add_subdirectory(01_gemm)
-add_subdirectory(02_gemm_add_add_fastgelu)
-add_subdirectory(03_gemm_layernorm)
-add_subdirectory(04_contraction)
-add_subdirectory(05_layernorm)
-add_subdirectory(06_softmax)
+# add all example subdir
+file(GLOB dir_list LIST_DIRECTORIES true *)
+FOREACH(subdir ${dir_list})
+    IF(IS_DIRECTORY "${subdir}")
+        add_subdirectory(${subdir})
+    ENDIF()
+ENDFOREACH()
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_e_permute_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F16;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmEPermuteXdl
// clang-format off
//######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | Type| Type| Type| DataType| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
using ReferenceBatchedGemmInstance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
BDataType,
EDataType,
AccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
const int M = 256;
const int N = 128;
const int K = 64;
const int stride_A = K;
const int stride_B = K;
const int batch_stride_A = M * K;
const int batch_stride_B = K * N;
const int G0 = 16;
const int G1 = 8;
const int batch_count = G0 * G1;
// output layout - [G0, M, G1, N]
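// element (g0, g1, m, n) of E lands at linear offset
// g0 * stride_G0 + m * stride_M + g1 * stride_G1 + n * stride_N,
// i.e. the G1 batch dimension is interleaved between M and N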
const int stride_G0 = M * G1 * N;
const int stride_G1 = N;
const int stride_M = G1 * N;
const int stride_N = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
exit(0);
}
// GEMM shape
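// batched_gemm_e_permute_desc carries only the permuted E lengths/strides;
// the plain GEMM sizes and the A/B strides are passed to MakeArgument below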
ck::tensor_operation::device::BatchedGemmEPermuteDesc batched_gemm_e_permute_desc{
G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N};
auto f_host_tensor_descriptor = [](std::size_t batch_count_,
std::size_t row,
std::size_t col,
std::size_t stride,
std::size_t batch_stride,
auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
std::vector<std::size_t>({batch_stride, stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
std::vector<std::size_t>({batch_stride, 1, stride}));
}
};
Tensor<ADataType> a_g_m_k(
f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
Tensor<BDataType> b_g_k_n(
f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
auto f_host_e_tensor_descriptor = [](std::size_t G0_,
std::size_t G1_,
std::size_t M_,
std::size_t N_,
std::size_t stride_G0_,
std::size_t stride_G1_,
std::size_t stride_M_,
std::size_t stride_N_) {
return HostTensorDescriptor(
std::vector<std::size_t>({G0_, G1_, M_, N_}),
std::vector<std::size_t>({stride_G0_, stride_G1_, stride_M_, stride_N_}));
};
Tensor<EDataType> e_g0_g1_m_n_host_result(
f_host_e_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
Tensor<EDataType> e_g0_g1_m_n_device_result(
f_host_e_tensor_descriptor(G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N));
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
std::cout << "e_g0_g1_m_n: " << e_g0_g1_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
}
DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) *
e_g0_g1_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_g_m_k.mData.data());
b_device_buf.ToDevice(b_g_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
// do GEMM
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
M,
N,
K,
stride_A,
stride_B,
batch_stride_A,
batch_stride_B,
batched_gemm_e_permute_desc,
batch_count,
a_element_op,
b_element_op,
cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * batch_count * M * N * K;
std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
sizeof(BDataType) * batch_count * K * N +
sizeof(EDataType) * batch_count * M * N;
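// ave_time is in ms, so flop / 1e9 / ms = TFLOP/s and bytes / 1e6 / ms = GB/s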
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true;
if(do_verification)
{
e_device_buf.FromDevice(e_g0_g1_m_n_device_result.mData.data());
auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
auto ref_invoker = ref_batched_gemm.MakeInvoker();
Tensor<EDataType> c_g_m_n_host_result = HostTensorDescriptor(
std::vector<std::size_t>({batch_count, M, N}), std::vector<std::size_t>({M * N, N, 1}));
auto ref_argument = ref_batched_gemm.MakeArgument(
a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, cde_element_op);
ref_invoker.Run(ref_argument);
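// scatter the reference [G, M, N] result into the permuted [G0, G1, M, N]
// layout, splitting the flat batch index as g = g0 * G1 + g1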
for(int g0 = 0; g0 < G0; g0++)
{
for(int g1 = 0; g1 < G1; g1++)
{
for(int m = 0; m < M; m++)
{
for(int n = 0; n < N; n++)
{
int g = g0 * G1 + g1;
e_g0_g1_m_n_host_result(g0, g1, m, n) = c_g_m_n_host_result(g, m, n);
}
}
}
}
pass = ck::utils::check_err(e_g0_g1_m_n_host_result.mData,
e_g0_g1_m_n_device_result.mData,
"Error: Incorrect results c");
}
return pass ? 0 : 1;
}
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NW_C;
     using WeiLayout      = ctc::G_K_X_C;
-    using BiasLayout     = ctc::G_NW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NW_K;
     using OutLayout      = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NHW_C;
     using WeiLayout      = ctc::G_K_YX_C;
-    using BiasLayout     = ctc::G_NHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NHW_K;
     using OutLayout      = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NDHW_C;
     using WeiLayout      = ctc::G_K_ZYX_C;
-    using BiasLayout     = ctc::G_NDHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NDHW_K;
     using OutLayout      = ctc::G_NDHW_K;
...
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NW_C;
     using WeiLayout      = ctc::G_K_X_C;
-    using BiasLayout     = ctc::G_NW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NW_K;
     using OutLayout      = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NHW_C;
     using WeiLayout      = ctc::G_K_YX_C;
-    using BiasLayout     = ctc::G_NHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NHW_K;
     using OutLayout      = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NDHW_C;
     using WeiLayout      = ctc::G_K_ZYX_C;
-    using BiasLayout     = ctc::G_NDHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NDHW_K;
     using OutLayout      = ctc::G_NDHW_K;
...
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NW_C;
     using WeiLayout      = ctc::G_K_X_C;
-    using BiasLayout     = ctc::G_NW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NW_K;
     using OutLayout      = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NHW_C;
     using WeiLayout      = ctc::G_K_YX_C;
-    using BiasLayout     = ctc::G_NHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NHW_K;
     using OutLayout      = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NDHW_C;
     using WeiLayout      = ctc::G_K_ZYX_C;
-    using BiasLayout     = ctc::G_NDHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NDHW_K;
     using OutLayout      = ctc::G_NDHW_K;
...
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NW_C;
     using WeiLayout      = ctc::G_K_X_C;
-    using BiasLayout     = ctc::G_NW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NW_K;
     using OutLayout      = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NHW_C;
     using WeiLayout      = ctc::G_K_YX_C;
-    using BiasLayout     = ctc::G_NHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NHW_K;
     using OutLayout      = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NDHW_C;
     using WeiLayout      = ctc::G_K_ZYX_C;
-    using BiasLayout     = ctc::G_NDHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NDHW_K;
     using OutLayout      = ctc::G_NDHW_K;
...
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NW_C;
     using WeiLayout      = ctc::G_K_X_C;
-    using BiasLayout     = ctc::G_NW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NW_K;
     using OutLayout      = ctc::G_NW_K;
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NHW_C;
     using WeiLayout      = ctc::G_K_YX_C;
-    using BiasLayout     = ctc::G_NHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NHW_K;
     using OutLayout      = ctc::G_NHW_K;
@@ -332,7 +332,7 @@ int main(int argc, char* argv[])
 {
     using InLayout       = ctc::G_NDHW_C;
     using WeiLayout      = ctc::G_K_ZYX_C;
-    using BiasLayout     = ctc::G_NDHW_K;
+    using BiasLayout     = ctc::G_K;
     using ResidualLayout = ctc::G_NDHW_K;
     using OutLayout      = ctc::G_NDHW_K;
...
 add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
 add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
-add_example_executable(example_padded_batched_gemm_scale_softmax_gemm_xdl_fp16 padded_batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
+add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
 add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
-add_custom_target(example_batched_gemm_scale_softmax_gemm)
-add_dependencies(example_batched_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16)
-add_dependencies(example_batched_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16)
-add_dependencies(example_batched_gemm_scale_softmax_gemm example_padded_batched_gemm_scale_softmax_gemm_xdl_fp16)
-add_dependencies(example_batched_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16)
+add_custom_target(example_gemm_scale_softmax_gemm)
+add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16)
+add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16)
+add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16)
+add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16)
@@ -150,8 +150,8 @@ int main(int argc, char* argv[])
     // GEMM shape for A/B0/B1/C
     // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o
-    ck::index_t M = 128;
-    ck::index_t N = 1024;
+    ck::index_t M = 120;
+    ck::index_t N = 1000;
     ck::index_t K = 64;
     ck::index_t O = 128;
     ck::index_t StrideA = -1;
...
@@ -55,7 +55,7 @@ using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
 using B1ElementOp = PassThrough;
 using CElementOp  = PassThrough;
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<
     ALayout,
@@ -73,7 +73,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
     Acc0ElementOp,
     B1ElementOp,
     CElementOp,
-    GemmDefault,
+    GemmSpec,
     1,
     256,
     128, // MPerBlock
@@ -113,7 +113,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
     1,              // CShuffleMXdlPerWavePerShuffle
     2,              // CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-    8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+    8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+    false>;
 // Ref Gemm0: fp16 in, fp32 out
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
@@ -144,8 +145,8 @@ int main(int argc, char* argv[])
     bool time_kernel = false;
     // GEMM shape
-    ck::index_t M = 1024;
-    ck::index_t N = 1024;
+    ck::index_t M = 1020;
+    ck::index_t N = 1020;
     ck::index_t K = 64;
     ck::index_t O = 128;
     ck::index_t BatchCount = 4;
...
@@ -16,7 +16,8 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -47,7 +48,9 @@ using CDataType = F16;
 using ALayout  = Row;
 using B0Layout = Col;
 using B1Layout = Row;
-using CLayout = Row;
+using CPermuteNumDims_G_M_O =
+    S<1, 1, 1>; // "using CLayout = Row" has been replaced by CPermuteNumDims_M_O
 using AElementOp  = PassThrough;
 using B0ElementOp = PassThrough;
@@ -55,65 +58,67 @@ using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
 using B1ElementOp = PassThrough;
 using CElementOp  = PassThrough;
-static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<
-    ALayout,
-    B0Layout,
-    B1Layout,
-    CLayout,
-    ADataType,
-    B0DataType,
-    B1DataType,
-    CDataType,
-    AccDataType,
-    CShuffleDataType,
-    AElementOp,
-    B0ElementOp,
-    Acc0ElementOp,
-    B1ElementOp,
-    CElementOp,
-    MNPadding,
-    1,
-    256,
-    128,         // MPerBlock
-    128,         // NPerBlock
-    32,          // KPerBlock
-    64,          // Gemm1NPerBlock
-    32,          // Gemm1KPerBlock
-    8,           // AK1
-    8,           // BK1
-    2,           // B1K1
-    32,          // MPerXDL
-    32,          // NPerXDL
-    1,           // MXdlPerWave
-    4,           // NXdlPerWave
-    2,           // Gemm1NXdlPerWave
-    S<4, 64, 1>, // ABlockTransfer
-    S<1, 0, 2>,
-    S<1, 0, 2>,
-    2,
-    8,
-    8,
-    true,
-    S<4, 64, 1>, // BBlockTransfer
-    S<1, 0, 2>,
-    S<1, 0, 2>,
-    2,
-    8,
-    8,
-    true,
-    S<16, 16, 1>, // B1BlockTransfer
-    S<0, 2, 1>,
-    S<0, 2, 1>,
-    1,
-    4,
-    2,
-    false,
-    1,              // CShuffleMXdlPerWavePerShuffle
-    2,              // CShuffleNXdlPerWavePerShuffle
-    S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-    8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+using DeviceGemmInstance =
+    ck::tensor_operation::device::DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+        ALayout,
+        B0Layout,
+        B1Layout,
+        CPermuteNumDims_G_M_O,
+        ADataType,
+        B0DataType,
+        B1DataType,
+        CDataType,
+        AccDataType,
+        CShuffleDataType,
+        AElementOp,
+        B0ElementOp,
+        Acc0ElementOp,
+        B1ElementOp,
+        CElementOp,
+        GemmSpec,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        64,          // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        2,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        2,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<16, 16, 1>, // B1BlockTransfer
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        false,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        false>;
 // Ref Gemm0: fp16 in, fp32 out
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
@@ -143,22 +148,6 @@ int main(int argc, char* argv[])
     int init_method = 1;
     bool time_kernel = false;
-    // GEMM shape
-    ck::index_t M = 1020;
-    ck::index_t N = 1020;
-    ck::index_t K = 64;
-    ck::index_t O = 128;
-    ck::index_t BatchCount = 4;
-    ck::index_t StrideA = -1;
-    ck::index_t StrideB0 = -1;
-    ck::index_t StrideB1 = -1;
-    ck::index_t StrideC = -1;
-    ck::index_t BatchStrideA = -1;
-    ck::index_t BatchStrideB0 = -1;
-    ck::index_t BatchStrideB1 = -1;
-    ck::index_t BatchStrideC = -1;
-    float alpha = 1;
     if(argc == 1)
     {
         // use default case
@@ -169,74 +158,58 @@ int main(int argc, char* argv[])
         init_method = std::stoi(argv[2]);
         time_kernel = std::stoi(argv[3]);
     }
-    else if(argc == 9)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-        time_kernel = std::stoi(argv[3]);
-        M = std::stoi(argv[4]);
-        N = std::stoi(argv[5]);
-        K = std::stoi(argv[6]);
-        O = std::stoi(argv[7]);
-        BatchCount = std::stoi(argv[8]);
-    }
-    else if(argc == 18)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-        time_kernel = std::stoi(argv[3]);
-        M = std::stoi(argv[4]);
-        N = std::stoi(argv[5]);
-        K = std::stoi(argv[6]);
-        O = std::stoi(argv[7]);
-        BatchCount = std::stoi(argv[8]);
-        StrideA = std::stoi(argv[9]);
-        StrideB0 = std::stoi(argv[10]);
-        StrideB1 = std::stoi(argv[11]);
-        StrideC = std::stoi(argv[12]);
-        BatchStrideA = std::stoi(argv[13]);
-        BatchStrideB0 = std::stoi(argv[14]);
-        BatchStrideB1 = std::stoi(argv[15]);
-        BatchStrideC = std::stoi(argv[16]);
-        alpha = std::stof(argv[17]);
-    }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 16: M, N, K, O, Batch, StrideA, StrideB0, StrideB1, StrideC, BatchStrideA, "
-               "BatchStrideB0, BatchStrideB1, BatchStrideC\n");
-        printf("arg17: scale (alpha)\n");
         exit(0);
     }
-    const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
-    const int DefaultStrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
-    const int DefaultStrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
-    const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? O : M;
-    StrideA = (StrideA < 0) ? DefaultStrideA : StrideA;
-    StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0;
-    StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1;
-    StrideC = (StrideC < 0) ? DefaultStrideC : StrideC;
-    const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA;
-    const int DefaultBatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
-    const int DefaultBatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
-    const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Col> ? O : M) * StrideC;
-    BatchStrideA = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA;
-    BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0;
-    BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1;
-    BatchStrideC = BatchStrideC < 0 ? DefaultBatchStrideC : BatchStrideC;
+    float alpha = 1; // scaling after 1st gemm
+
+    std::size_t group_count = 13;
+
+    // Problem descs
+    std::vector<DeviceGemmInstance::ProblemDesc> problem_descs;
+    std::vector<const void*> p_a;
+    std::vector<const void*> p_b0;
+    std::vector<const void*> p_b1;
+    std::vector<void*> p_c;
+
+    for(std::size_t i = 0; i < group_count; i++)
+    {
+        int M = 128 * (rand() % 8 + 1);
+        int N = 128 * (rand() % 8 + 1);
+        int K = 40;
+        int O = 40 * (rand() % 2 + 1);
+        int Batch = rand() % 8 + 1;
+
+        const int StrideA = ck::is_same_v<ALayout, Row> ? K : M;
+        const int StrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
+        const int StrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
+
+        const int BatchStrideA = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA;
+        const int BatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
+        const int BatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
+
+        std::vector<ck::index_t> c_gs_ms_os_lengths{Batch, M, O};
+        std::vector<ck::index_t> c_gs_ms_os_strides{O, Batch * O, 1};
+
+        problem_descs.push_back({M,
+                                 N,
+                                 K,
+                                 O,
+                                 Batch,
+                                 StrideA,
+                                 StrideB0,
+                                 StrideB1,
+                                 BatchStrideA,
+                                 BatchStrideB0,
+                                 BatchStrideB1,
+                                 c_gs_ms_os_lengths,
+                                 c_gs_ms_os_strides});
+    }
     auto f_host_tensor_descriptor = [](std::size_t batch_count,
                                        std::size_t row,
@@ -256,56 +229,108 @@ int main(int argc, char* argv[])
         }
     };
-    // C_m_o = A_m_k * B0_k_n * B1_n_o
-    Tensor<ADataType> a_g_m_k(
-        f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{}));
-    Tensor<B0DataType> b0_g_k_n(
-        f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{}));
-    Tensor<B1DataType> b1_g_n_o(
-        f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{}));
-    Tensor<CDataType> c_g_m_o_host_result(
-        f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
-    Tensor<CDataType> c_g_m_o_device_result(
-        f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
-    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
-    std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl;
-    std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
-    std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl;
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-5, 5});
-        break;
-    case 2:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
-        break;
-    case 3:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
-        break;
-    default:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
-    }
-    DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSpaceSize());
-    DeviceMem c_g_m_o_device_buf(sizeof(CDataType) *
-                                 c_g_m_o_device_result.mDesc.GetElementSpaceSize());
-    a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data());
-    b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data());
-    b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data());
+    std::vector<Tensor<ADataType>> a_tensors;
+    std::vector<Tensor<B0DataType>> b0_tensors;
+    std::vector<Tensor<B1DataType>> b1_tensors;
+    std::vector<Tensor<CDataType>> c_tensors;
+
+    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
+
+    std::vector<DeviceMemPtr> a_tensors_device;
+    std::vector<DeviceMemPtr> b0_tensors_device;
+    std::vector<DeviceMemPtr> b1_tensors_device;
+    std::vector<DeviceMemPtr> c_tensors_device;
+
+    std::size_t flop = 0, num_byte = 0;
+
+    std::cout << "group count " << group_count << ". printing first 4 groups\n";
+    for(std::size_t i = 0; i < group_count; i++)
+    {
+        const auto& M = problem_descs[i].M;
+        const auto& N = problem_descs[i].N;
+        const auto& K = problem_descs[i].K;
+        const auto& O = problem_descs[i].O;
+        const auto& Batch = problem_descs[i].Batch;
+        const auto& StrideA = problem_descs[i].StrideA;
+        const auto& StrideB0 = problem_descs[i].StrideB0;
+        const auto& StrideB1 = problem_descs[i].StrideB1;
+        const auto& BatchStrideA = problem_descs[i].BatchStrideA;
+        const auto& BatchStrideB0 = problem_descs[i].BatchStrideB0;
+        const auto& BatchStrideB1 = problem_descs[i].BatchStrideB1;
+        const auto& c_gs_ms_os_lengths = problem_descs[i].c_gs_ms_os_lengths;
+        const auto& c_gs_ms_os_strides = problem_descs[i].c_gs_ms_os_strides;
+
+        // C_m_o = A_m_k * B0_k_n * B1_n_o
+        Tensor<ADataType> a_g_m_k(
+            f_host_tensor_descriptor(Batch, M, K, StrideA, BatchStrideA, ALayout{}));
+        Tensor<B0DataType> b0_g_k_n(
+            f_host_tensor_descriptor(Batch, K, N, StrideB0, BatchStrideB0, B0Layout{}));
+        Tensor<B1DataType> b1_g_n_o(
+            f_host_tensor_descriptor(Batch, N, O, StrideB1, BatchStrideB1, B1Layout{}));
+        Tensor<CDataType> c_gs_ms_os_device_result(
+            std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
+            std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
+
+        flop += (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * Batch;
+        num_byte += (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
+                     sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
+                    Batch;
+
+        if(i < 4)
+        {
+            std::cout << "a_g_m_k[" << i << "]: " << a_g_m_k.mDesc << ", "
+                      << "b0_g_k_n[" << i << "]: " << b0_g_k_n.mDesc << ", "
+                      << "b1_g_n_o[" << i << "]: " << b1_g_n_o.mDesc << ", "
+                      << "c_gs_ms_os[" << i << "]: " << c_gs_ms_os_device_result.mDesc << std::endl;
+        }
+
+        switch(init_method)
+        {
+        case 0: break;
+        case 1:
+            a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+            b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+            b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
+            break;
+        case 2:
+            a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+            b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
+            b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
+            break;
+        case 3:
+            a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+            b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+            b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+            break;
+        default:
+            a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+            b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        }
+
+        a_tensors.push_back(a_g_m_k);
+        b0_tensors.push_back(b0_g_k_n);
+        b1_tensors.push_back(b1_g_n_o);
+        c_tensors.push_back(c_gs_ms_os_device_result);
+
+        a_tensors_device.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()));
+        b0_tensors_device.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSpaceSize()));
+        b1_tensors_device.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSpaceSize()));
+        c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
+            sizeof(CDataType) * c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()));
+
+        a_tensors_device[i]->ToDevice(a_g_m_k.mData.data());
+        b0_tensors_device[i]->ToDevice(b0_g_k_n.mData.data());
+        b1_tensors_device[i]->ToDevice(b1_g_n_o.mData.data());
+
+        p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
+        p_b0.push_back(b0_tensors_device[i]->GetDeviceBuffer());
+        p_b1.push_back(b1_tensors_device[i]->GetDeviceBuffer());
+        p_c.push_back(c_tensors_device[i]->GetDeviceBuffer());
+    }
    auto a_element_op = AElementOp{};
    auto b0_element_op = B0ElementOp{};
@@ -314,31 +339,23 @@ int main(int argc, char* argv[])
     auto c_element_op = CElementOp{};
     // do GEMM
     auto gemm = DeviceGemmInstance{};
     auto invoker = gemm.MakeInvoker();
-    auto argument =
-        gemm.MakeArgument(static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
-                          static_cast<B0DataType*>(b0_g_k_n_device_buf.GetDeviceBuffer()),
-                          static_cast<B1DataType*>(b1_g_n_o_device_buf.GetDeviceBuffer()),
-                          static_cast<CDataType*>(c_g_m_o_device_buf.GetDeviceBuffer()),
-                          M,
-                          N,
-                          K,
-                          O,
-                          BatchCount,
-                          StrideA,
-                          StrideB0,
-                          StrideB1,
-                          StrideC,
-                          BatchStrideA,
-                          BatchStrideB0,
-                          BatchStrideB1,
-                          BatchStrideC,
-                          a_element_op,
-                          b0_element_op,
-                          acc0_element_op,
-                          b1_element_op,
-                          c_element_op);
+    auto argument = gemm.MakeArgument(p_a,
+                                      p_b0,
+                                      p_b1,
+                                      p_c,
+                                      problem_descs,
+                                      a_element_op,
+                                      b0_element_op,
+                                      acc0_element_op,
+                                      b1_element_op,
+                                      c_element_op);
+
+    // specify workspace for problem_desc
+    DeviceMem problem_desc_workspace(gemm.GetWorkSpaceSize(&argument));
+
+    gemm.SetWorkSpacePointer(&argument, problem_desc_workspace.GetDeviceBuffer());
     if(!gemm.IsSupportedArgument(argument))
     {
@@ -349,49 +366,79 @@ int main(int argc, char* argv[])
     float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount;
-    std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
-                             sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
-                            BatchCount;
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
               << gemm.GetTypeString() << std::endl;
-    c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
+    bool pass = true;
     if(do_verification)
     {
-        // Output of Gemm0 is input A of Gemm1
-        Tensor<AccDataType> acc0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
-        Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
-        auto ref_gemm0 = ReferenceGemm0Instance{};
-        auto ref_gemm0_invoker = ref_gemm0.MakeInvoker();
-        auto ref_gemm0_argument = ref_gemm0.MakeArgument(
-            a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op);
-        ref_gemm0_invoker.Run(ref_gemm0_argument);
-        auto ref_softmax = ReferenceSoftmaxInstance{};
-        auto ref_softmax_invoker = ref_softmax.MakeInvoker();
-        auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2});
-        ref_softmax_invoker.Run(ref_softmax_argument);
-        auto ref_gemm1 = ReferenceGemm1Instance{};
-        auto ref_gemm1_invoker = ref_gemm1.MakeInvoker();
-        auto ref_gemm1_argument = ref_gemm1.MakeArgument(
-            a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op);
-        ref_gemm1_invoker.Run(ref_gemm1_argument);
-        return ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData) ? 0 : 1;
+        for(std::size_t i = 0; i < group_count; i++)
+        {
+            const auto& M = problem_descs[i].M;
+            const auto& N = problem_descs[i].N;
+            const auto& O = problem_descs[i].O;
+            const auto& Batch = problem_descs[i].Batch;
+            const auto& c_gs_ms_os_lengths = problem_descs[i].c_gs_ms_os_lengths;
+            const auto& c_gs_ms_os_strides = problem_descs[i].c_gs_ms_os_strides;
+
+            const auto& a_g_m_k = a_tensors[i];
+            const auto& b0_g_k_n = b0_tensors[i];
+            const auto& b1_g_n_o = b1_tensors[i];
+            auto& c_gs_ms_os_device_result = c_tensors[i];
+            auto& c_gs_ms_os_device_buf = *c_tensors_device[i];
+
+            Tensor<CDataType> c_gs_ms_os_host_result(
+                std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
+                std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
+
+            c_gs_ms_os_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
+
+            // Output of Gemm0 is input A of Gemm1
+            Tensor<AccDataType> acc0_m_n(f_host_tensor_descriptor(Batch, M, N, N, M * N, Row{}));
+            Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(Batch, M, N, N, M * N, Row{}));
+            Tensor<CDataType> c_g_m_o_host_result(std::vector<int>{Batch, M, O},
+                                                  std::vector<int>{M * O, O, 1});
+
+            auto ref_gemm0 = ReferenceGemm0Instance{};
+            auto ref_gemm0_invoker = ref_gemm0.MakeInvoker();
+            auto ref_gemm0_argument = ref_gemm0.MakeArgument(
+                a_g_m_k, b0_g_k_n, acc0_m_n, a_element_op, b0_element_op, acc0_element_op);
+
+            ref_gemm0_invoker.Run(ref_gemm0_argument);
+
+            auto ref_softmax = ReferenceSoftmaxInstance{};
+            auto ref_softmax_invoker = ref_softmax.MakeInvoker();
+            auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_m_n, a1_g_m_n, 1, 0, {2});
+
+            ref_softmax_invoker.Run(ref_softmax_argument);
+
+            auto ref_gemm1 = ReferenceGemm1Instance{};
+            auto ref_gemm1_invoker = ref_gemm1.MakeInvoker();
+            auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n,
+                                                             b1_g_n_o,
+                                                             c_g_m_o_host_result,
+                                                             PassThrough{},
+                                                             b1_element_op,
+                                                             c_element_op);
+
+            ref_gemm1_invoker.Run(ref_gemm1_argument);
+
+            // Note: in this example, we merely permute the dimensions by changing underlying
+            // strides so we simply access data as-is
+            c_gs_ms_os_host_result.ForEach(
+                [&](auto& self, auto idx) { self(idx) = c_g_m_o_host_result(idx); });
+
+            bool pass_ =
+                ck::utils::check_err(c_gs_ms_os_device_result.mData, c_gs_ms_os_host_result.mData);
+            pass &= pass_;
+        }
     }
-    return 0;
+    return pass ? 0 : 1;
 }
add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
template <ck::index_t NDimSpatial,
typename OutDataType,
typename WeiDataType,
typename BiasDataType,
typename InDataType,
typename OutElementOp,
typename WeiElementOp,
typename InElementOp,
typename DeviceInstance>
int run_conv_bwd_data_bias_relu(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& bias_g_n_c_wis_desc,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const OutElementOp& out_element_op,
const WeiElementOp& wei_element_op,
const InElementOp& in_element_op)
{
Tensor<OutDataType> out(out_g_n_k_wos_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<BiasDataType> bias(bias_g_n_c_wis_desc);
Tensor<InDataType> in_host(in_g_n_c_wis_desc);
Tensor<InDataType> in_device(in_g_n_c_wis_desc);
std::cout << "out: " << out.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl;
std::cout << "in: " << in_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5});
break;
default:
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{0.0, 1.0});
}
DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
out_device_buf.ToDevice(out.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
// reset input to zero
in_device_buf.SetZero();
std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> d0_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> d0_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
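// the host descriptors store lengths/strides as std::size_t, while the device op
// takes fixed-size std::array<ck::index_t, ...>, hence the element-wise copies below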
copy(out_g_n_k_wos_desc.GetLengths(), a_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), a_g_n_k_wos_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(bias_g_n_c_wis_desc.GetLengths(), d0_g_n_c_wis_lengths);
copy(bias_g_n_c_wis_desc.GetStrides(), d0_g_n_c_wis_strides);
copy(in_g_n_c_wis_desc.GetLengths(), e_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), e_g_n_c_wis_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// do conv
auto conv = DeviceInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(
out_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
in_device_buf.GetDeviceBuffer(),
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{d0_g_n_c_wis_lengths},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{d0_g_n_c_wis_strides},
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
out_element_op,
wei_element_op,
in_element_op);
if(!conv.IsSupportedArgument(argument))
{
printf("wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem\n");
return 1;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// c doesn't physically exist, any layout is fine
Tensor<float> c_host(in_g_n_c_wis_desc);
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
float,
WeiDataType,
OutDataType,
PassThrough,
WeiElementOp,
OutElementOp>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(c_host,
wei,
out,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
PassThrough{},
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
// TODO: implement elementwise operation for host
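// here in_element_op is AddRelu, so this computes in = ReLU(conv_bwd_result + bias)
// per element; the bias is broadcast through its zero-stride dimensions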
in_host.ForEach(
[&](auto&, auto idx) { in_element_op(in_host(idx), c_host(idx), bias(idx)); });
in_device_buf.FromDevice(in_device.mData.data());
return ck::utils::check_err(in_device.mData, in_host.mData) ? 0 : 1;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_conv_bwd_data_bias_relu_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using OutDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using BiasDataType = ck::half_t; // bias
using InDataType = ck::half_t;
using OutLayout = ck::tensor_layout::convolution::GNHWK;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_C;
using InLayout = ck::tensor_layout::convolution::GNHWC;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using CBiasInElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto ConvBwdDataDefault =
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
template <ck::index_t NDimSpatial>
using DeviceConvNdBwdDataInstance =
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<
NDimSpatial,
OutLayout,
WeiLayout,
ck::Tuple<BiasLayout>,
InLayout,
OutDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType>,
InDataType,
OutElementOp,
WeiElementOp,
CBiasInElementOp,
ConvBwdDataDefault,
true, // DoPadGemmM
true, // DoPadGemmN
1,
256,
128,
256,
32,
8,
2,
32,
32,
2,
4,
S<4, 64, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
1,
S<4, 64, 1>,
S<0, 2, 1>,
S<0, 2, 1>,
1,
4,
2,
0,
1,
1,
S<1, 32, 1, 8>,
8>;
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 256, {3, 3}, {14, 14}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = CBiasInElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 2)
{
// output image: GNHWK
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
conv_param);
// weight: GKYXC
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
conv_param);
// input image bias: G_C
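// the lengths describe the full [G, N, C, Hi, Wi] image, but the zero strides for
// n, hi and wi broadcast the single per-(g, c) bias value across batch and space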
const auto bias_g_n_c_wis_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
0, // n
1, // c
0, // hi
0 // wi
});
// input image: GNHWC
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
using DeviceInstance = DeviceConvNdBwdDataInstance<2>;
run_conv_bwd_data_bias_relu<2,
OutDataType,
WeiDataType,
BiasDataType,
InDataType,
OutElementOp,
WeiElementOp,
CBiasInElementOp,
DeviceInstance>(do_verification,
init_method,
time_kernel,
conv_param,
out_g_n_k_wos_desc,
wei_g_k_c_xs_desc,
bias_g_n_c_wis_desc,
in_g_n_c_wis_desc,
out_element_op,
wei_element_op,
in_element_op);
}
return 0;
}
@@ -21,36 +21,10 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
     add_dependencies(examples ${EXAMPLE_NAME})
 endfunction(add_example_executable_no_testing EXAMPLE_NAME)
-add_subdirectory(01_gemm)
-add_subdirectory(02_gemm_bilinear)
-add_subdirectory(03_gemm_bias_relu)
-add_subdirectory(04_gemm_add_add_fastgelu)
-add_subdirectory(09_convnd_fwd)
-add_subdirectory(10_convnd_fwd_multiple_d_multiple_reduce)
-add_subdirectory(12_reduce)
-add_subdirectory(13_pool2d_fwd)
-add_subdirectory(14_gemm_xdl_requant_relu_requant)
-add_subdirectory(15_grouped_gemm)
-add_subdirectory(16_gemm_multi_d_multi_reduces)
-add_subdirectory(17_convnd_bwd_data)
-add_subdirectory(18_batched_gemm_reduce)
-add_subdirectory(19_binary_elementwise)
-add_subdirectory(20_convnd_bwd_weight)
-add_subdirectory(21_gemm_layernorm)
-add_subdirectory(22_cgemm)
-add_subdirectory(23_softmax)
-add_subdirectory(24_batched_gemm)
-add_subdirectory(25_gemm_bias_e_permute)
-add_subdirectory(26_contraction)
-add_subdirectory(27_layernorm)
-add_subdirectory(28_grouped_gemm_bias_e_permute)
-add_subdirectory(29_batched_gemm_bias_e_permute)
-add_subdirectory(30_grouped_convnd_fwd_bias_relu_add)
-add_subdirectory(31_batched_gemm_gemm)
-add_subdirectory(32_batched_gemm_scale_softmax_gemm)
-add_subdirectory(33_multiple_reduce)
-add_subdirectory(34_batchnorm)
-add_subdirectory(35_splitK_gemm)
-add_subdirectory(36_sparse_embedding)
-add_subdirectory(37_batched_gemm_add_add_relu_gemm_add)
-add_subdirectory(41_grouped_conv_conv_fwd)
+# add all example subdir
+file(GLOB dir_list LIST_DIRECTORIES true *)
+FOREACH(subdir ${dir_list})
+    IF(IS_DIRECTORY "${subdir}")
+        add_subdirectory(${subdir})
+    ENDIF()
+ENDFOREACH()
@@ -649,6 +649,9 @@ struct BlockwiseGemmXdlops_v2
     static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
     static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
+    static_assert(KPerThread % KPack == 0,
+                  "Wrong KPack setting; try increasing KPerThread or decreasing KPack");
+
     StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                               FloatAcc,
                               MRepeat * NRepeat,
...
@@ -549,10 +549,6 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
         BElementwiseOperation,
         CDEElementwiseOperation,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_M_K,
-        BGridDesc_N_K,
-        DsGridDesc_M_N,
-        EGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
@@ -586,12 +582,19 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;
-    using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
+    // desc for blockwise copy
+    using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
+    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;
-    using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
+    // block-to-e-tile map
+    using Block2ETileMap =
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
     // Argument
     struct Argument : public BaseArgument
@@ -719,10 +722,9 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e_grid_desc_mblock_mperblock_nblock_nperblock_;
+        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;
         // block-to-e-tile map
         Block2ETileMap block_2_etile_map_;
@@ -786,10 +788,10 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
             CDEElementwiseOperation,
             DeviceOp::AGridDesc_AK0_M_AK1,
             DeviceOp::BGridDesc_BK0_N_BK1,
-            typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-            typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+            DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
             ComputePtrOffsetOfStridedBatch,
-            typename GridwiseGemm::DefaultBlock2ETileMap,
+            DeviceOp::Block2ETileMap,
             has_main_loop>;
         return launch_and_time_kernel(stream_config,
...
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
/*
* \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
*
* \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix
 * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly
 * strided batches, but we can easily extend to other layouts. The returned offset can be either \p
 * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB
 * limitation.
*
* \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and
* returns the 2D index of the tile that it computes. \see
* GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
* \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
* tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
* descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
* device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link
* DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of
* pointer offset into \p ComputePtrOffsetOfStridedBatch.
*
* \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes.
 * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusions) to
* realize BatchedGemmCPermute and GroupedGemm (and the corresponding GEMM fusion).
*
*/
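/*
 * Illustration only (hypothetical helper, not used in this file): a
 * ComputePtrOffsetOfBatch variant for batches that are NOT evenly strided,
 * looking the per-batch offsets up from device-visible tables. Any type that
 * exposes these three Get*PtrOffset() methods can be plugged in.
 */
struct ComputePtrOffsetOfTabulatedBatch
{
    __host__ __device__ long_index_t GetAPtrOffset(index_t g_idx) const
    {
        return p_a_offsets_[g_idx]; // arbitrary per-batch offset of A
    }

    __host__ __device__ long_index_t GetBPtrOffset(index_t g_idx) const
    {
        return p_b_offsets_[g_idx]; // arbitrary per-batch offset of B
    }

    __host__ __device__ long_index_t GetCPtrOffset(index_t g_idx) const
    {
        return p_c_offsets_[g_idx]; // arbitrary per-batch offset of E/C
    }

    // device-visible arrays of length batch_count
    const long_index_t* p_a_offsets_;
    const long_index_t* p_b_offsets_;
    const long_index_t* p_c_offsets_;
};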
template <typename GridwiseGemm,
typename ABDataType,
typename EDataType,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
typename ComputePtrOffsetOfBatch,
typename Block2ETileMap,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_e_permute_xdl(const ABDataType* __restrict__ p_a_grid,
const ABDataType* __restrict__ p_b_grid,
EDataType* __restrict__ p_e_grid,
const index_t batch_count,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
e_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const Block2ETileMap block_2_etile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
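    // the 1-d grid is split evenly across batches; __builtin_amdgcn_readfirstlane
    // keeps these batch-uniform values in scalar registers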
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
ck::Tuple<>{},
p_e_grid + e_batch_offset,
p_shared,
a_element_op,
b_element_op,
cde_element_op,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
ck::Tuple<>{},
e_grid_desc_mblock_mperblock_nblock_nperblock,
block_2_etile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_e_grid;
ignore = batch_count;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = e_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = a_element_op;
ignore = b_element_op;
ignore = cde_element_op;
ignore = compute_ptr_offset_of_batch;
ignore = block_2_etile_map;
#endif
}
template <typename ALayout,
typename BLayout,
typename ELayout,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
GemmSpecialization GemmSpec,
index_t NumPrefetch,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t AK1,
index_t BK1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_K1,
index_t ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_K1,
index_t BBlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CDEBlockTransferScalarPerVector_NPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
BLayout,
ELayout,
ADataType,
BDataType,
EDataType,
AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation>
{
using DeviceOp = DeviceBatchedGemmEPermuteXdl;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
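    // pads MRaw/NRaw/KRaw of the problem descriptors up to multiples of the
    // block tile, as dictated by GemmSpec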
static constexpr auto matrix_padder =
MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA)
{
const auto a_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(StrideA, I1));
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(I1, StrideA));
}
}();
return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
}
static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
}
static auto
MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t stride_M, index_t stride_N)
{
const auto e_grid_desc_mraw_nraw =
make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), make_tuple(stride_M, stride_N));
return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
}
static auto MakeEGridDescriptor_G0_G1_M_N(index_t G0,
index_t G1,
index_t MRaw,
index_t NRaw,
index_t stride_G0,
index_t stride_G1,
index_t stride_M,
index_t stride_N)
{
const auto e_grid_desc_g0_g1_mraw_nraw = [&]() {
return make_naive_tensor_descriptor(
make_tuple(G0, G1, MRaw, NRaw),
make_tuple(stride_G0, stride_G1, stride_M, stride_N));
}();
const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
const auto MPad = M - MRaw;
const auto NPad = N - NRaw;
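        // illustrative numbers: MRaw = 1000 with MPerBlock = 256 gives M = 1024, MPad = 24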
if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
GemmSpec == GemmSpecialization::MNKPadding)
{
// pad M and N
return transform_tensor_descriptor(
e_grid_desc_g0_g1_mraw_nraw,
make_tuple(make_pass_through_transform(G0),
make_pass_through_transform(G1),
make_right_pad_transform(MRaw, MPad),
make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
}
else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
GemmSpec == GemmSpecialization::MKPadding)
{
// pad M, but not N
return transform_tensor_descriptor(
e_grid_desc_g0_g1_mraw_nraw,
make_tuple(make_pass_through_transform(G0),
make_pass_through_transform(G1),
make_right_pad_transform(MRaw, MPad),
make_pass_through_transform(NRaw)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
}
else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
GemmSpec == GemmSpecialization::NKPadding)
{
// pad N, but not M
return transform_tensor_descriptor(
e_grid_desc_g0_g1_mraw_nraw,
make_tuple(make_pass_through_transform(G0),
make_pass_through_transform(G1),
make_pass_through_transform(MRaw),
make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
}
else
{
// not pad M or N
return e_grid_desc_g0_g1_mraw_nraw;
}
}
using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1));
using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1));
using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1, 1));
using EGridDesc_G0_G1_M_N = decltype(MakeEGridDescriptor_G0_G1_M_N(1, 1, 1, 1, 1, 1, 1, 1));
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch(index_t Batchstride_A,
index_t Batchstride_B,
EGridDesc_G0_G1_M_N e_grid_desc_g0_g1_m_n)
: Batchstride_A_(Batchstride_A),
Batchstride_B_(Batchstride_B),
e_grid_desc_g0_g1_m_n_(e_grid_desc_g0_g1_m_n)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(Batchstride_A_);
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(Batchstride_B_);
}
__host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const
{
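            // decompose the flat batch index into (b0, b1); the 4-d E descriptor then
            // maps it to a memory offset, which is what realizes the output permutation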
const index_t G1 = e_grid_desc_g0_g1_m_n_.GetLength(I1);
index_t b0 = g_idx / G1;
index_t b1 = g_idx - b0 * G1; // g_idx % G1
return e_grid_desc_g0_g1_m_n_.CalculateOffset(make_multi_index(b0, b1, 0, 0));
}
private:
index_t Batchstride_A_;
index_t Batchstride_B_;
EGridDesc_G0_G1_M_N e_grid_desc_g0_g1_m_n_;
};
using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
ADataType, // TODO: distinguish A/B datatype
AccDataType,
CShuffleDataType,
ck::Tuple<>, // DsDataType,
EDataType, // EDataType,
AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation,
InMemoryDataOperationEnum::Set,
AGridDesc_M_K,
BGridDesc_N_K,
Tuple<>,
EGridDesc_M_N,
NumPrefetch,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
AK1,
BK1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
false, // BThreadTransferSrcResetCoordinateAfterRun,
BBlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CDEBlockTransferScalarPerVector_NPerBlock,
LoopSched>;
using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype(
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}));
using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
// Argument
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a_grid,
const BDataType* p_b_grid,
EDataType* p_e_grid,
index_t M,
index_t N,
index_t K,
index_t stride_A,
index_t stride_B,
index_t batch_stride_A,
index_t batch_stride_B,
BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
index_t BatchCount,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_e_grid_{p_e_grid},
BatchCount_(BatchCount),
a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(M, K, stride_A)},
b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(K, N, stride_B)},
e_grid_desc_m_n_{
DeviceOp::MakeEGridDescriptor_M_N(batched_gemm_e_permute_desc.M_,
batched_gemm_e_permute_desc.N_,
batched_gemm_e_permute_desc.stride_M_,
batched_gemm_e_permute_desc.stride_N_)},
a_grid_desc_ak0_m_ak1_{
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)},
b_grid_desc_bk0_n_bk1_{
GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
e_grid_desc_g0_g1_m_n_{
DeviceOp::MakeEGridDescriptor_G0_G1_M_N(batched_gemm_e_permute_desc.G0_,
batched_gemm_e_permute_desc.G1_,
batched_gemm_e_permute_desc.M_,
batched_gemm_e_permute_desc.N_,
batched_gemm_e_permute_desc.stride_G0_,
batched_gemm_e_permute_desc.stride_G1_,
batched_gemm_e_permute_desc.stride_M_,
batched_gemm_e_permute_desc.stride_N_)},
compute_ptr_offset_of_batch_{batch_stride_A, batch_stride_B, e_grid_desc_g0_g1_m_n_},
block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
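            // build the blocked E descriptor only for valid problems; invalid
            // arguments are rejected later by IsSupportedArgument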
if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
b_grid_desc_n_k_,
ck::Tuple<>{},
e_grid_desc_m_n_,
block_2_etile_map_))
{
                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
e_grid_desc_m_n_);
}
}
void Print() const
{
std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl;
std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl;
std::cout << "C[M, N]: " << e_grid_desc_m_n_ << std::endl;
}
// private:
// pointers
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
EDataType* p_e_grid_;
// batch count
index_t BatchCount_;
        // tensor descriptors for problem definition
AGridDesc_M_K a_grid_desc_m_k_;
BGridDesc_N_K b_grid_desc_n_k_;
EGridDesc_M_N e_grid_desc_m_n_;
// tensor descriptors for block/thread-wise copy
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;
EGridDesc_G0_G1_M_N e_grid_desc_g0_g1_m_n_;
// for calculating Batch offset
ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
// block-to-e-tile map
Block2ETileMap block_2_etile_map_;
// element-wise op
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
arg.b_grid_desc_n_k_,
ck::Tuple<>{},
arg.e_grid_desc_m_n_,
arg.block_2_etile_map_))
{
                throw std::runtime_error(
                    "wrong! DeviceBatchedGemmEPermuteXdl has invalid setting");
}
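            // the grid holds one full E-tile grid per batch (cf. num_blocks_per_batch
            // in kernel_batched_gemm_e_permute_xdl)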
const index_t grid_size =
arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.BatchCount_;
const auto K =
arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
auto launch_kernel = [&](auto has_main_k_block_loop_) {
const auto kernel = kernel_batched_gemm_e_permute_xdl<
GridwiseGemm,
                    ADataType, // TODO: distinguish A/B datatype
EDataType,
remove_reference_t<DeviceOp::AGridDesc_AK0_M_AK1>,
remove_reference_t<DeviceOp::BGridDesc_BK0_N_BK1>,
typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
AElementwiseOperation,
BElementwiseOperation,
CDEElementwiseOperation,
ComputePtrOffsetOfStridedBatch,
remove_reference_t<Block2ETileMap>,
has_main_k_block_loop_>;
return launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_e_grid_,
arg.BatchCount_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
                                          arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.a_element_op_,
arg.b_element_op_,
arg.cde_element_op_,
arg.compute_ptr_offset_of_batch_,
arg.block_2_etile_map_);
};
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
{
return launch_kernel(integral_constant<bool, true>{});
}
else
{
return launch_kernel(integral_constant<bool, false>{});
}
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
arg.b_grid_desc_n_k_,
ck::Tuple<>{},
arg.e_grid_desc_m_n_,
arg.block_2_etile_map_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const ADataType* p_a,
const BDataType* p_b,
EDataType* p_e,
index_t M,
index_t N,
index_t K,
index_t stride_A,
index_t stride_B,
index_t batch_stride_A,
index_t batch_stride_B,
BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
index_t BatchCount,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{p_a,
p_b,
p_e,
M,
N,
K,
stride_A,
stride_B,
batch_stride_A,
batch_stride_B,
batched_gemm_e_permute_desc,
BatchCount,
a_element_op,
b_element_op,
cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_e,
index_t M,
index_t N,
index_t K,
index_t stride_A,
index_t stride_B,
index_t batch_stride_A,
index_t batch_stride_B,
BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
index_t BatchCount,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<EDataType*>(p_e),
M,
N,
K,
stride_A,
stride_B,
batch_stride_A,
batch_stride_B,
batched_gemm_e_permute_desc,
BatchCount,
a_element_op,
b_element_op,
cde_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchedGemmEPermuteXdl"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
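
For reference, a minimal host-side usage sketch of this device op. It assumes a concrete instance alias DeviceGemmInstance of DeviceBatchedGemmEPermuteXdl<...>, device buffers p_a/p_b/p_e, and caller-defined sizes and strides; the aggregate field order of BatchedGemmEPermuteDesc below is also an assumption.

    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    auto device_op = DeviceGemmInstance{};

    // E is addressed as a G0 x G1 x M x N tensor, so the batch count is G0 * G1
    auto argument = device_op.MakeArgumentPointer(
        p_a, p_b, p_e, M, N, K, stride_A, stride_B, batch_stride_A, batch_stride_B,
        ck::tensor_operation::device::BatchedGemmEPermuteDesc{
            G0, G1, M, N, stride_G0, stride_G1, stride_M, stride_N},
        G0 * G1, PassThrough{}, PassThrough{}, PassThrough{});

    if(!device_op.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error("wrong! this instance does not support the problem");
    }

    auto invoker   = device_op.MakeInvoker();
    float ave_time = invoker.Run(argument.get(), StreamConfig{nullptr, /* time_kernel */ true});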
@@ -503,13 +503,9 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                            arg.b_grid_desc_bk0_n_bk1_,
-                                            arg.b1_grid_desc_bk0_n_bk1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!DeviceOp::IsSupportedArgument(arg))
            {
-                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+                throw std::runtime_error("wrong! unsupported argument");
            }

            const index_t grid_size =
...
@@ -333,10 +333,6 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
        BElementwiseOperation,
        CDEElementwiseOperation,
        InMemoryDataOperationEnum::Set,
-        AGridDesc_M_K,
-        BGridDesc_N_K,
-        DsGridDesc_M_N,
-        EGridDesc_M_N,
        NumGemmKPrefetchStage,
        BlockSize,
        MPerBlock,
@@ -370,12 +366,19 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
        CDEBlockTransferScalarPerVector_NPerBlock,
        LoopSched>;

+    // desc for blockwise copy
    using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
        GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
    using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
        GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
+    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;

-    using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
+    // block-to-e-tile map
+    using Block2ETileMap =
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;

    // Argument
    struct Argument : public BaseArgument
@@ -478,10 +481,9 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
        // tensor descriptors for block/thread-wise copy
        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-            e_grid_desc_mblock_mperblock_nblock_nperblock_;
+        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+            ds_grid_desc_mblock_mperblock_nblock_nperblock_;
+        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;

        // for calculating batch offset
        ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
@@ -520,21 +522,21 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
            auto launch_kernel = [&](auto has_main_k_block_loop) {
                constexpr bool has_main_loop = has_main_k_block_loop.value;

-                const auto kernel = kernel_batched_gemm_xdl<
-                    GridwiseGemm,
-                    ADataType, // TODO: distiguish A/B datatype
-                    typename GridwiseGemm::DsGridPointer,
-                    EDataType,
-                    AElementwiseOperation,
-                    BElementwiseOperation,
-                    CDEElementwiseOperation,
-                    DeviceOp::AGridDesc_AK0_M_AK1,
-                    DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    ComputePtrOffsetOfStridedBatch,
-                    Block2ETileMap,
-                    has_main_loop>;
+                const auto kernel =
+                    kernel_batched_gemm_xdl<GridwiseGemm,
+                                            ADataType, // TODO: distiguish A/B datatype
+                                            typename GridwiseGemm::DsGridPointer,
+                                            EDataType,
+                                            AElementwiseOperation,
+                                            BElementwiseOperation,
+                                            CDEElementwiseOperation,
+                                            DeviceOp::AGridDesc_AK0_M_AK1,
+                                            DeviceOp::BGridDesc_BK0_N_BK1,
+                                            DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                            ComputePtrOffsetOfStridedBatch,
+                                            Block2ETileMap,
+                                            has_main_loop>;

                return launch_and_time_kernel(stream_config,
                                              kernel,
...