Commit b89a88b5 authored by Adam Osewski

Merge branch 'develop' into wavelet_model

parents 41d5fca7 43c898f6
add_custom_target(example_splitK_gemm_xdl)
add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp)
add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp)
add_example_executable(example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp)
add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp)
add_dependencies(example_splitK_gemm_xdl
example_splitK_gemm_xdl_fp32
example_splitK_gemm_xdl_fp16
example_splitK_gemm_xdl_bfp16
example_splitK_gemm_xdl_int8)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp)
add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4)
endif()
#pragma once
struct ProblemSize final
{
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t stride_A = K;
ck::index_t stride_B = K;
ck::index_t stride_C = N;
ck::index_t k_batch = 4;
};
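// k_batch selects the split-K factor: the K dimension is divided into k_batch
// slices whose partial products are reduced into C. Larger values expose more
// parallelism when K is large relative to M and N, at the cost of extra
// reduction traffic.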
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
};
bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
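// For the int4 build, host data held as ck::int4_t is copied into kernel buffers
// typed as int8_t; the size checks below make that reinterpretation safe.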
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
static_assert(sizeof(ADataType) == sizeof(KernelADataType));
static_assert(sizeof(BDataType) == sizeof(KernelBDataType));
#endif
auto& [M, N, K, StrideA, StrideB, StrideC, KBatch] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
switch(config.init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
case 2:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
}
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
#ifdef BUILD_INT4_EXAMPLE
const Tensor<KernelADataType> a_m_k_converted(a_m_k);
const Tensor<KernelBDataType> b_k_n_converted(b_k_n);
a_m_k_device_buf.ToDevice(a_m_k_converted.mData.data());
b_k_n_device_buf.ToDevice(b_k_n_converted.mData.data());
#else
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
#endif
c_m_n_device_buf.SetZero();
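// Zero-initialize C: with split-K the k_batch partial results are accumulated
// into the output buffer, so it must start from zero.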
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
#ifdef BUILD_INT4_EXAMPLE
static_cast<KernelADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<KernelBDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
#else
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
#endif
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
KBatch);
if(!gemm.IsSupportedArgument(argument))
{
std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
return false;
}
invoker.Run(argument, StreamConfig{nullptr, false});
bool pass = true;
if(config.do_verification)
{
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
if(std::is_same<CDataType, ck::half_t>::value)
{
pass &= ck::utils::check_err(c_m_n_device_result.mData,
c_m_n_host_result.mData,
"fp16 incorrect result",
3e-3,
1e-3);
}
else
{
pass &= ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
}
}
if(config.time_kernel)
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
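// ave_time is in ms: flop / 1e9 / ms yields TFLOP/s, and bytes / 1e6 / ms yields GB/s.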
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << gemm.GetTypeString() << std::endl;
}
return pass;
}
bool run_splitK_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
if(argc == 1)
{
// use default case
}
else if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
problem_size.k_batch = std::stoi(argv[4]);
}
else if(argc == 11)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
problem_size.k_batch = std::stoi(argv[4]);
problem_size.M = std::stoi(argv[5]);
problem_size.N = std::stoi(argv[6]);
problem_size.K = std::stoi(argv[7]);
problem_size.stride_A = std::stoi(argv[8]);
problem_size.stride_B = std::stoi(argv[9]);
problem_size.stride_C = std::stoi(argv[10]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4: KBatch\n");
printf("arg5 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
return run_splitK_gemm(problem_size, config);
}
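// Illustrative invocations (the fp32/fp16/bfp16/int8 example binaries all accept the same arguments):
//   ./example_splitK_gemm_xdl_fp16                  # defaults: M=3840, N=4096, K=4096, KBatch=4
//   ./example_splitK_gemm_xdl_fp16 1 2 1 8          # verify, decimal init, time kernel, KBatch=8
//   ./example_splitK_gemm_xdl_fp16 1 2 1 8 3840 4096 4096 4096 4096 4096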
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = BF16;
using BDataType = BF16;
using AccDataType = F32;
using CDataType = F32;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle
// clang-format off
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_splitK_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CDataType = F16;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle
// clang-format off
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
#include "run_splitK_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F32;
using BDataType = F32;
using AccDataType = F32;
using CDataType = F32;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle
// clang-format off
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_splitK_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = ck::int4_t;
using BDataType = ck::int4_t;
using AccDataType = int32_t;
using CDataType = int32_t;
using KernelADataType = int8_t;
using KernelBDataType = int8_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle
// clang-format off
<KernelADataType, //ADataType
KernelBDataType, //BDataType
CDataType, //EDataType
AccDataType, //AccDataType
ALayout, //ALayout
BLayout, //BLayout
CLayout, //ELayout
AElementOp, //AElementwiseOperation
BElementOp, //BElementwiseOperation
CElementOp, //CElementwiseOperation
GemmDefault, //GEMMSpecialization
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
4, // KPerBlock
16, // K1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<1, 4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<0, 2, 1, 3>, // ABlockTransfer ThreadCluster ArrangeOrder
S<0, 2, 1, 3>, // ABlockTransfer SrcAccessOrder
3, // ABlockTransfer SrcVectorDim
16, // ABlockTransfer SrcScalarPerVector
16, // ABlockTransfer DstScalarPerVector_K1
true, // ABlockLdsExtraM
S<1, 4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<0, 1, 3, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<0, 1, 3, 2>, // BBlockTransfer SrcAccessOrder
3, // BBlockTransfer SrcVectorDim
16, // BBlockTransfer SrcScalarPerVector
16, // BBlockTransfer DstScalarPerVector_K1
true, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CBlockTransferClusterLengths _MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
4>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
#define BUILD_INT4_EXAMPLE
#include "run_splitK_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = int8_t;
using BDataType = int8_t;
using AccDataType = int32_t;
using CDataType = int32_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle
// clang-format off
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 16, 16, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 16, 16, true, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_splitK_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); }
add_example_executable(example_sparse_embedding3_forward_layernorm sparse_embedding3_forward_layernorm.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include <ctime>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_sparse_embedding3_forward_layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp"
// using EmbType = float;
// using IndexType = int64_t;
// using GammaDataType = float;
// using BetaDataType = float;
// using AccDataType = float;
// using OutType = float;
using EmbType = ck::half_t;
using IndexType = int64_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using AccDataType = float;
using OutType = ck::half_t;
// clang-format off
// BlockSize, DimClusterSize, RowClusterSize, DimPerBlock, RowPerBlock, DimThreadSize, RowVectorSize
using DeviceInstance_fp32_e256 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 256, 1, 1>;
using DeviceInstance_fp32_e512 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 512, 1, 1>;
using DeviceInstance_fp32_e768 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 768, 1, 1>;
using DeviceInstance_fp32_e1024 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 1024, 1, 1>;
using DeviceInstance_fp32_e1536 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 1536, 1, 1>;
using DeviceInstance_fp32_e2048 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 2048, 1, 4>;
using DeviceInstance_fp32_e4096 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 4096, 1, 4>;
using DeviceInstance_fp32_e8192 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 8192, 1, 4>;
using DeviceInstance_fp32_e16384 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 16384, 1, 4>;
using DeviceInstance_fp16_e256 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 256, 1, 1>;
using DeviceInstance_fp16_e512 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 512, 1, 2>;
using DeviceInstance_fp16_e768 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 768, 1, 1>;
using DeviceInstance_fp16_e1024 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 1024, 1, 2>;
using DeviceInstance_fp16_e1536 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 1536, 1, 2>;
using DeviceInstance_fp16_e2048 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 2048, 1, 2>;
using DeviceInstance_fp16_e4096 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 4096, 1, 8>;
using DeviceInstance_fp16_e8192 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 8192, 1, 8>;
template<typename emb_type, ck::index_t dim> struct emb_kernel{};
template<> struct emb_kernel<float, 256> { using kernel_type = DeviceInstance_fp32_e256; };
template<> struct emb_kernel<float, 512> { using kernel_type = DeviceInstance_fp32_e512; };
template<> struct emb_kernel<float, 768> { using kernel_type = DeviceInstance_fp32_e768; };
template<> struct emb_kernel<float, 1024> { using kernel_type = DeviceInstance_fp32_e1024;};
template<> struct emb_kernel<float, 1536> { using kernel_type = DeviceInstance_fp32_e1536;};
template<> struct emb_kernel<float, 2048> { using kernel_type = DeviceInstance_fp32_e2048;};
template<> struct emb_kernel<float, 4096> { using kernel_type = DeviceInstance_fp32_e4096;};
template<> struct emb_kernel<float, 8192> { using kernel_type = DeviceInstance_fp32_e8192;};
template<> struct emb_kernel<float, 16384>{ using kernel_type = DeviceInstance_fp32_e16384;};
template<> struct emb_kernel<ck::half_t, 256> { using kernel_type = DeviceInstance_fp16_e256; };
template<> struct emb_kernel<ck::half_t, 512> { using kernel_type = DeviceInstance_fp16_e512; };
template<> struct emb_kernel<ck::half_t, 768> { using kernel_type = DeviceInstance_fp16_e768; };
template<> struct emb_kernel<ck::half_t, 1024> { using kernel_type = DeviceInstance_fp16_e1024; };
template<> struct emb_kernel<ck::half_t, 1536> { using kernel_type = DeviceInstance_fp16_e1536; };
template<> struct emb_kernel<ck::half_t, 2048> { using kernel_type = DeviceInstance_fp16_e2048; };
template<> struct emb_kernel<ck::half_t, 4096> { using kernel_type = DeviceInstance_fp16_e4096; };
template<> struct emb_kernel<ck::half_t, 8192> { using kernel_type = DeviceInstance_fp16_e8192; };
// clang-format on
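// Each of the index_length output rows gathers one row from each of the three embedding
// tables (emb_a[idx_a[i]] + emb_b[idx_b[i]] + emb_c[idx_c[i]]) and applies layernorm with
// gamma/beta; the reference op used for verification below implements the same computation.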
int main()
{
bool time_kernel = true;
constexpr auto num_rows = 65536;
constexpr auto dims = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{};
// constexpr auto dims = ck::Sequence<256, 512>{};
constexpr auto index_length = 2048;
constexpr AccDataType epsilon = 1e-4;
auto f_host_tensor_desc_1d = [](std::size_t len_) {
return HostTensorDescriptor(std::vector<std::size_t>({len_}));
};
auto f_host_tensor_desc_2d = [](std::size_t rows_, std::size_t cols_) {
return HostTensorDescriptor(std::vector<std::size_t>({rows_, cols_}));
};
using ReferenceInstance =
ck::tensor_operation::host::ReferenceSparseEmbedding3ForwardLayernorm<EmbType,
IndexType,
GammaDataType,
BetaDataType,
AccDataType,
OutType>;
ck::static_for<0, dims.Size(), 1>{}([&](auto I) {
std::srand(std::time(nullptr));
constexpr auto current_dim = dims.At(I);
Tensor<EmbType> emb_a(f_host_tensor_desc_2d(num_rows, current_dim));
Tensor<EmbType> emb_b(f_host_tensor_desc_2d(num_rows, current_dim));
Tensor<EmbType> emb_c(f_host_tensor_desc_2d(num_rows, current_dim));
Tensor<IndexType> index_a(f_host_tensor_desc_1d(index_length));
Tensor<IndexType> index_b(f_host_tensor_desc_1d(index_length));
Tensor<IndexType> index_c(f_host_tensor_desc_1d(index_length));
Tensor<GammaDataType> gamma(f_host_tensor_desc_1d(current_dim));
Tensor<BetaDataType> beta(f_host_tensor_desc_1d(current_dim));
Tensor<OutType> out(f_host_tensor_desc_2d(index_length, current_dim));
emb_a.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
emb_b.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
emb_c.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
index_a.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
index_b.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
index_c.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize());
DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize());
DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize());
DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize());
DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize());
DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize());
emb_a_dev.ToDevice(emb_a.mData.data());
emb_b_dev.ToDevice(emb_b.mData.data());
emb_c_dev.ToDevice(emb_c.mData.data());
index_a_dev.ToDevice(index_a.mData.data());
index_b_dev.ToDevice(index_b.mData.data());
index_c_dev.ToDevice(index_c.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
auto device_instance = typename emb_kernel<EmbType, current_dim>::kernel_type{};
auto argument_ptr = device_instance.MakeArgumentPointer(out_dev.GetDeviceBuffer(),
emb_a_dev.GetDeviceBuffer(),
emb_b_dev.GetDeviceBuffer(),
emb_c_dev.GetDeviceBuffer(),
index_a_dev.GetDeviceBuffer(),
index_b_dev.GetDeviceBuffer(),
index_c_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
num_rows,
current_dim,
index_length,
epsilon);
std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
<< std::endl
<< std::flush;
bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get());
if(!is_supported)
{
std::cout << "Runtime parameters are not supported" << std::endl;
return;
}
auto invoker_ptr = device_instance.MakeInvokerPointer();
float time_ms = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
bool pass = true;
{
Tensor<OutType> out_from_dev(f_host_tensor_desc_2d(index_length, current_dim));
ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(out,
emb_a,
emb_b,
emb_c,
index_a,
index_b,
index_c,
gamma,
beta,
num_rows,
current_dim,
index_length,
epsilon);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
out_dev.FromDevice(out_from_dev.mData.data());
pass &= ck::utils::check_err(
out_from_dev.mData, out.mData, "Error: Incorrect results", 1e-3, 1e-3);
}
double total_read = current_dim * index_length * 3 * sizeof(EmbType) +
current_dim * sizeof(GammaDataType) +
current_dim * sizeof(BetaDataType);
double total_write = current_dim * index_length * sizeof(OutType);
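// time_ms is in ms, so bytes / time_ms / 1e6 gives GB/s.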
double gbps = (total_read + total_write) / time_ms / 1e6;
std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms
<< ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl
<< std::flush;
});
return 0;
}
add_example_executable(example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
/*
Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[m, n]) * B1[n, o] + D1[m, o]
*/
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using A0DataType = F16;
using B0DataType = F16;
using Acc0DataType = F32;
using D00DataType = F16;
using D01DataType = F16;
using B1DataType = F16;
using Acc1DataType = F32;
using C1ShuffleDataType = F32;
using D1DataType = F16;
using E1DataType = F16;
using A0Layout = Row;
using B0Layout = Col;
using D00Layout = Row;
using D01Layout = Row;
using B1Layout = Row;
using D1Layout = Row;
using E1Layout = Row;
// E = Relu(C + D0 + D1)
struct AddAddRelu
{
__host__ __device__ void
operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d0, const ck::half_t& d1) const
{
const ck::half_t x = c + d0 + d1;
ck::tensor_operation::element_wise::Relu{}.template operator()<ck::half_t>(e, x);
}
__host__ __device__ void
operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
{
const float x = c + (d0 + d1);
ck::tensor_operation::element_wise::Relu{}.template operator()<float>(e, x);
}
};
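// For example, AddAddRelu{}(e, c, d0, d1) yields e = max(c + d0 + d1, 0).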
// E = Gelu(C + D0 + D1)
struct AddAddGelu
{
__host__ __device__ void
operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d0, const ck::half_t& d1) const
{
const ck::half_t x = c + d0 + d1;
ck::tensor_operation::element_wise::Gelu{}.template operator()<ck::half_t, ck::half_t>(e,
x);
}
__host__ __device__ void
operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
{
const float x = c + (d0 + d1);
ck::tensor_operation::element_wise::Gelu{}.template operator()<float, float>(e, x);
}
};
// E = FastGelu(C + D0 + D1)
struct AddAddFastGelu
{
__host__ __device__ void
operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
{
const float x = c + (d0 + d1);
ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(e, x);
}
};
using A0ElementOp = PassThrough;
using B0ElementOp = PassThrough;
using CDE0ElementOp = AddAddRelu;
using A1ElementOp = PassThrough;
using B1ElementOp = PassThrough;
using CDE1ElementOp = ck::tensor_operation::element_wise::Add;
static constexpr bool PadGemm0M = false;
static constexpr bool PadGemm0N = false;
static constexpr bool PadGemm0K = false;
static constexpr bool PadGemm1N = false;
static constexpr bool PadGemm1K = false;
using DeviceGemmInstance =
ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle<
A0Layout,
B0Layout,
ck::Tuple<D00Layout, D01Layout>,
B1Layout,
ck::Tuple<D1Layout>,
E1Layout,
A0DataType,
B0DataType,
Acc0DataType,
ck::Tuple<D00DataType, D01DataType>,
B1DataType,
Acc1DataType,
C1ShuffleDataType,
ck::Tuple<D1DataType>,
E1DataType,
A0ElementOp,
B0ElementOp,
CDE0ElementOp,
B1ElementOp,
CDE1ElementOp,
PadGemm0M,
PadGemm0N,
PadGemm0K,
PadGemm1N,
PadGemm1K,
1,
256,
128, // MPerBlock
128, // NPerBlock
32, // KPerBlock
128, // Gemm1NPerBlock
32, // Gemm1KPerBlock
8, // AK1
8, // BK1
2, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
4, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<4, 64, 1>, // BBlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<8, 32, 1>, // B1BlockTransfer
S<0, 2, 1>,
S<0, 2, 1>,
1,
4,
2,
false,
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // CShuffleBlockTransferScalarPerVector_NPerBlock
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 64;
ck::index_t O = 128;
ck::index_t BatchCount = 4;
ck::index_t StrideA0 = -1;
ck::index_t StrideB0 = -1;
ck::index_t StrideD00 = -1;
ck::index_t StrideD01 = -1;
ck::index_t StrideB1 = -1;
ck::index_t StrideD1 = -1;
ck::index_t StrideE1 = -1;
ck::index_t BatchStrideA0 = -1;
ck::index_t BatchStrideB0 = -1;
ck::index_t BatchStrideD00 = -1;
ck::index_t BatchStrideD01 = -1;
ck::index_t BatchStrideB1 = -1;
ck::index_t BatchStrideD1 = -1;
ck::index_t BatchStrideE1 = -1;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 9)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
O = std::stoi(argv[7]);
BatchCount = std::stoi(argv[8]);
}
else if(argc == 23)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
O = std::stoi(argv[7]);
BatchCount = std::stoi(argv[8]);
StrideA0 = std::stoi(argv[9]);
StrideB0 = std::stoi(argv[10]);
StrideD00 = std::stoi(argv[11]);
StrideD01 = std::stoi(argv[12]);
StrideB1 = std::stoi(argv[13]);
StrideD1 = std::stoi(argv[14]);
StrideE1 = std::stoi(argv[15]);
BatchStrideA0 = std::stoi(argv[16]);
BatchStrideB0 = std::stoi(argv[17]);
BatchStrideD00 = std::stoi(argv[18]);
BatchStrideD01 = std::stoi(argv[19]);
BatchStrideB1 = std::stoi(argv[20]);
BatchStrideD1 = std::stoi(argv[21]);
BatchStrideE1 = std::stoi(argv[22]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 8: M, N, K, O, Batch\n");
printf(
"arg9 to 15: StrideA0, StrideB0, StrideD00, StrideD01, StrideB1, StrideD1, StrideE1\n");
printf("arg16 to 22: BatchStrideA0, BatchStrideB0, BatchStrideD00, BatchStrideD01, "
"BatchStrideB1, BatchStrideD1, BatchStrideE1 \n");
exit(0);
}
const int DefaultStrideA0 = ck::is_same_v<A0Layout, Row> ? K : M;
const int DefaultStrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
const int DefaultStrideD00 = ck::is_same_v<D00Layout, Row> ? N : M;
const int DefaultStrideD01 = ck::is_same_v<D01Layout, Row> ? N : M;
const int DefaultStrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
const int DefaultStrideD1 = ck::is_same_v<D1Layout, Row> ? O : M;
const int DefaultStrideE1 = ck::is_same_v<E1Layout, Row> ? O : M;
StrideA0 = (StrideA0 < 0) ? DefaultStrideA0 : StrideA0;
StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0;
StrideD00 = (StrideD00 < 0) ? DefaultStrideD00 : StrideD00;
StrideD01 = (StrideD01 < 0) ? DefaultStrideD01 : StrideD01;
StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1;
StrideD1 = (StrideD1 < 0) ? DefaultStrideD1 : StrideD1;
StrideE1 = (StrideE1 < 0) ? DefaultStrideE1 : StrideE1;
const int DefaultBatchStrideA0 = (ck::is_same_v<A0Layout, Col> ? K : M) * StrideA0;
const int DefaultBatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
const int DefaultBatchStrideD00 = (ck::is_same_v<D00Layout, Col> ? N : M) * StrideD00;
const int DefaultBatchStrideD01 = (ck::is_same_v<D01Layout, Col> ? N : M) * StrideD01;
const int DefaultBatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
const int DefaultBatchStrideD1 = (ck::is_same_v<D1Layout, Col> ? O : M) * StrideD1;
const int DefaultBatchStrideE1 = (ck::is_same_v<E1Layout, Col> ? O : M) * StrideE1;
BatchStrideA0 = BatchStrideA0 < 0 ? DefaultBatchStrideA0 : BatchStrideA0;
BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0;
BatchStrideD00 = BatchStrideD00 < 0 ? DefaultBatchStrideD00 : BatchStrideD00;
BatchStrideD01 = BatchStrideD01 < 0 ? DefaultBatchStrideD01 : BatchStrideD01;
BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1;
BatchStrideD1 = BatchStrideD1 < 0 ? DefaultBatchStrideD1 : BatchStrideD1;
BatchStrideE1 = BatchStrideE1 < 0 ? DefaultBatchStrideE1 : BatchStrideE1;
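// The defaults above pack batches contiguously, e.g. a row-major M x K A0 with leading
// stride StrideA0 occupies M * StrideA0 elements per batch.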
auto f_host_tensor_descriptor = [](std::size_t batch_count,
std::size_t row,
std::size_t col,
std::size_t stride,
std::size_t batch_stride,
auto layout) {
if(std::is_same<decltype(layout), Row>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
std::vector<std::size_t>({batch_stride, stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
std::vector<std::size_t>({batch_stride, 1, stride}));
}
};
// E_m_o = A_m_k * B0_k_n * B1_n_o
Tensor<A0DataType> a0_g_m_k(
f_host_tensor_descriptor(BatchCount, M, K, StrideA0, BatchStrideA0, A0Layout{}));
Tensor<B0DataType> b0_g_k_n(
f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{}));
Tensor<D00DataType> d00_g_m_n(
f_host_tensor_descriptor(BatchCount, M, N, StrideD00, BatchStrideD00, D00Layout{}));
Tensor<D01DataType> d01_g_m_n(
f_host_tensor_descriptor(BatchCount, M, N, StrideD01, BatchStrideD01, D01Layout{}));
Tensor<B1DataType> b1_g_n_o(
f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{}));
Tensor<D1DataType> d1_g_m_o(
f_host_tensor_descriptor(BatchCount, M, O, StrideD1, BatchStrideD1, D1Layout{}));
Tensor<E1DataType> e1_g_m_o_host_result(
f_host_tensor_descriptor(BatchCount, M, O, StrideE1, BatchStrideE1, E1Layout{}));
Tensor<E1DataType> e1_g_m_o_device_result(
f_host_tensor_descriptor(BatchCount, M, O, StrideE1, BatchStrideE1, E1Layout{}));
std::cout << "a0_g_m_k: " << a0_g_m_k.mDesc << std::endl;
std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl;
std::cout << "d00_g_m_n: " << d00_g_m_n.mDesc
<< " size: " << d00_g_m_n.mDesc.GetElementSpaceSize() << std::endl;
std::cout << "d01_g_m_n: " << d01_g_m_n.mDesc
<< " size: " << d01_g_m_n.mDesc.GetElementSpaceSize() << std::endl;
std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
std::cout << "e1_g_m_o: " << e1_g_m_o_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a0_g_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 3});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 3});
d00_g_m_n.GenerateTensorValue(GeneratorTensor_2<D00DataType>{-2, 3});
d01_g_m_n.GenerateTensorValue(GeneratorTensor_2<D01DataType>{-2, 3});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 3});
d1_g_m_o.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-2, 3});
break;
case 2:
a0_g_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
d00_g_m_n.GenerateTensorValue(GeneratorTensor_3<D00DataType>{0.0, 1.0});
d01_g_m_n.GenerateTensorValue(GeneratorTensor_3<D01DataType>{0.0, 1.0});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
d1_g_m_o.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
break;
default:
a0_g_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{1});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
d00_g_m_n.GenerateTensorValue(GeneratorTensor_1<D00DataType>{1});
d01_g_m_n.GenerateTensorValue(GeneratorTensor_1<D01DataType>{1});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
d1_g_m_o.GenerateTensorValue(GeneratorTensor_1<D1DataType>{1});
}
DeviceMem a0_g_m_k_device_buf(sizeof(A0DataType) * a0_g_m_k.mDesc.GetElementSpaceSize());
DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSpaceSize());
DeviceMem d00_g_m_n_device_buf(sizeof(D00DataType) * d00_g_m_n.mDesc.GetElementSpaceSize());
DeviceMem d01_g_m_n_device_buf(sizeof(D01DataType) * d01_g_m_n.mDesc.GetElementSpaceSize());
DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSpaceSize());
DeviceMem e1_g_m_o_device_buf(sizeof(E1DataType) *
e1_g_m_o_device_result.mDesc.GetElementSpaceSize());
DeviceMem d1_g_m_o_device_buf(sizeof(D1DataType) * d1_g_m_o.mDesc.GetElementSpaceSize());
a0_g_m_k_device_buf.ToDevice(a0_g_m_k.mData.data());
b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data());
d00_g_m_n_device_buf.ToDevice(d00_g_m_n.mData.data());
d01_g_m_n_device_buf.ToDevice(d01_g_m_n.mData.data());
b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data());
d1_g_m_o_device_buf.ToDevice(d1_g_m_o.mData.data());
auto a0_element_op = A0ElementOp{};
auto b0_element_op = B0ElementOp{};
auto cde0_element_op = CDE0ElementOp{};
auto b1_element_op = B1ElementOp{};
auto cde1_element_op = CDE1ElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument =
gemm.MakeArgument(static_cast<A0DataType*>(a0_g_m_k_device_buf.GetDeviceBuffer()),
static_cast<B0DataType*>(b0_g_k_n_device_buf.GetDeviceBuffer()),
std::array<const void*, 2>{d00_g_m_n_device_buf.GetDeviceBuffer(),
d01_g_m_n_device_buf.GetDeviceBuffer()},
static_cast<B1DataType*>(b1_g_n_o_device_buf.GetDeviceBuffer()),
std::array<const void*, 1>{d1_g_m_o_device_buf.GetDeviceBuffer()},
static_cast<E1DataType*>(e1_g_m_o_device_buf.GetDeviceBuffer()),
M,
N,
K,
O,
BatchCount,
StrideA0,
StrideB0,
std::array<ck::index_t, 2>{StrideD00, StrideD01},
StrideB1,
std::array<ck::index_t, 1>{StrideD1},
StrideE1,
BatchStrideA0,
BatchStrideB0,
std::array<ck::index_t, 2>{BatchStrideD00, BatchStrideD01},
BatchStrideB1,
std::array<ck::index_t, 1>{BatchStrideD1},
BatchStrideE1,
a0_element_op,
b0_element_op,
cde0_element_op,
b1_element_op,
cde1_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
return 0;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
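// FLOP count per batch: Gemm0 (M x K) * (K x N) contributes 2*M*N*K, and
// Gemm1 (M x N) * (N x O) contributes 2*M*N*O.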
std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount;
std::size_t num_btype =
(sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(D00DataType) * M * N +
sizeof(D01DataType) * M * N + sizeof(B1DataType) * N * O + sizeof(E1DataType) * M * O +
sizeof(D1DataType) * M * O) *
BatchCount;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data());
if(do_verification)
{
using ReferenceGemm0Instance =
ck::tensor_operation::host::ReferenceBatchedGemm<A0DataType,
B0DataType,
Acc0DataType,
Acc0DataType,
A0ElementOp,
B0ElementOp,
PassThrough>;
using ReferenceGemm1Instance =
ck::tensor_operation::host::ReferenceBatchedGemm<Acc0DataType,
B1DataType,
Acc1DataType,
Acc1DataType,
PassThrough,
B1ElementOp,
PassThrough>;
// Output of Gemm0 is input A of Gemm1
Tensor<Acc0DataType> c0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
Tensor<Acc0DataType> e0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
Tensor<Acc1DataType> c1_g_m_o(f_host_tensor_descriptor(BatchCount, M, O, O, M * O, Row{}));
auto ref_gemm0 = ReferenceGemm0Instance{};
auto ref_gemm0_invoker = ref_gemm0.MakeInvoker();
auto ref_gemm0_argument = ref_gemm0.MakeArgument(
a0_g_m_k, b0_g_k_n, c0_g_m_n, a0_element_op, b0_element_op, PassThrough{});
ref_gemm0_invoker.Run(ref_gemm0_argument);
// bias+bias+relu
e0_g_m_n.ForEach([&](auto&, auto idx) {
cde0_element_op(e0_g_m_n(idx), c0_g_m_n(idx), d00_g_m_n(idx), d01_g_m_n(idx));
});
auto ref_gemm1 = ReferenceGemm1Instance{};
auto ref_gemm1_invoker = ref_gemm1.MakeInvoker();
auto ref_gemm1_argument = ref_gemm1.MakeArgument(
e0_g_m_n, b1_g_n_o, c1_g_m_o, PassThrough{}, b1_element_op, PassThrough{});
ref_gemm1_invoker.Run(ref_gemm1_argument);
// bias
e1_g_m_o_host_result.ForEach([&](auto&, auto idx) {
cde1_element_op(e1_g_m_o_host_result(idx), c1_g_m_o(idx), d1_g_m_o(idx));
});
return ck::utils::check_err(e1_g_m_o_device_result.mData, e1_g_m_o_host_result.mData) ? 0
: 1;
}
return 0;
}
add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp)
add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp)
add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp)
add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp)
endif(USE_BITINT_EXTENSION_INT4)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using In0DataType = ck::bhalf_t;
using Wei0DataType = ck::bhalf_t;
using Acc0DataType = float;
using Wei1DataType = ck::bhalf_t;
using Acc1DataType = float;
using C1ShuffleDataType = float;
using Out1DataType = ck::bhalf_t;
// This is used for reference code
using Out0DataType = ck::bhalf_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using In0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceBatchedGemmGemmInstance =
ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
Row, // ALayout
Col, // B0Layout
Col, // B1Layout
Row, // CLayout
In0DataType, // ADataType,
Wei0DataType, // B0DataType,
Wei1DataType, // B1DataType,
Out1DataType, // CDataType,
Acc0DataType, // AccDataType,
C1ShuffleDataType, // CShuffleDataType,
In0ElementOp, // AElementOp,
Wei0ElementOp, // B0ElementOp,
Out0ElementOp, // Acc0ElementOp,
Wei1ElementOp, // B1ElementOp,
Out1ElementOp, // CElementOp,
GemmDefault,
1,
256,
128, // MPerBlock
128, // NPerBlock
32, // KPerBlock
128, // Gemm1NPerBlock
32, // Gemm1KPerBlock
8, // AK1
8, // BK1
4, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
4, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<4, 64, 1>, // BBlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<4, 64, 1>, // B1BlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
4,
4,
true,
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // CShuffleBlockTransferScalarPerVector_NPerBlock
#include "run_grouped_conv_conv_fwd_example.inc"
int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using In0DataType = ck::half_t;
using Wei0DataType = ck::half_t;
using Acc0DataType = float;
using Wei1DataType = ck::half_t;
using Acc1DataType = float;
using C1ShuffleDataType = float;
using Out1DataType = ck::half_t;
// This is used for reference code
using Out0DataType = ck::half_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using In0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceBatchedGemmGemmInstance =
ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
Row, // ALayout
Col, // B0Layout
Col, // B1Layout
Row, // CLayout
In0DataType, // ADataType,
Wei0DataType, // B0DataType,
Wei1DataType, // B1DataType,
Out1DataType, // CDataType,
Acc0DataType, // AccDataType,
C1ShuffleDataType, // CShuffleDataType,
In0ElementOp, // AElementOp,
Wei0ElementOp, // B0ElementOp,
Out0ElementOp, // Acc0ElementOp,
Wei1ElementOp, // B1ElementOp,
Out1ElementOp, // CElementOp,
GemmDefault,
1,
256,
128, // MPerBlock
128, // NPerBlock
32, // KPerBlock
128, // Gemm1NPerBlock
32, // Gemm1KPerBlock
8, // AK1
8, // BK1
4, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
4, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<4, 64, 1>, // BBlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<4, 64, 1>, // B1BlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
4,
4,
true,
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // CShuffleBlockTransferScalarPerVector_NPerBlock
#include "run_grouped_conv_conv_fwd_example.inc"
int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using In0DataType = float;
using Wei0DataType = float;
using Acc0DataType = float;
using Wei1DataType = float;
using Acc1DataType = float;
using C1ShuffleDataType = float;
using Out1DataType = float;
// This is used for reference code
using Out0DataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using In0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceBatchedGemmGemmInstance =
ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
Row, // ALayout
Col, // B0Layout
Col, // B1Layout
Row, // CLayout
In0DataType, // ADataType,
Wei0DataType, // B0DataType,
Wei1DataType, // B1DataType,
Out1DataType, // CDataType,
Acc0DataType, // AccDataType,
C1ShuffleDataType, // CShuffleDataType,
In0ElementOp, // AElementOp,
Wei0ElementOp, // B0ElementOp,
Out0ElementOp, // Acc0ElementOp,
Wei1ElementOp, // B1ElementOp,
Out1ElementOp, // CElementOp,
GemmDefault,
1,
256,
128, // MPerBlock
128, // NPerBlock
16, // KPerBlock
128, // Gemm1NPerBlock
16, // Gemm1KPerBlock
4, // AK1
4, // BK1
2, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
4, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
4,
4,
true,
S<4, 64, 1>, // BBlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
4,
4,
true,
S<4, 64, 1>, // B1BlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
2,
2,
true,
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 16, 1, 16>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
4>; // CShuffleBlockTransferScalarPerVector_NPerBlock
#include "run_grouped_conv_conv_fwd_example.inc"
int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#error Should compile this file with ck::int4_t support
#endif
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using In0DataType = ck::int4_t;
using Wei0DataType = ck::int4_t;
using KernelIn0DataType = int8_t;
using KernelWei0DataType = int8_t;
using Acc0DataType = int32_t;
using Wei1DataType = ck::int4_t;
using KernelWei1DataType = int8_t;
using Acc1DataType = int32_t;
using C1ShuffleDataType = int32_t;
using Out1DataType = ck::int4_t;
using KernelOut1DataType = int8_t;
// This is used for reference code
using Out0DataType = ck::int4_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using In0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceBatchedGemmGemmInstance =
ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
Row, // ALayout
Col, // B0Layout
Col, // B1Layout
Row, // CLayout
KernelIn0DataType, // ADataType,
KernelWei0DataType, // B0DataType,
KernelWei1DataType, // B1DataType,
KernelOut1DataType, // CDataType,
Acc0DataType, // AccDataType,
C1ShuffleDataType, // CShuffleDataType,
In0ElementOp, // AElementOp,
Wei0ElementOp, // B0ElementOp,
Out0ElementOp, // Acc0ElementOp,
Wei1ElementOp, // B1ElementOp,
Out1ElementOp, // CElementOp,
GemmDefault,
1,
256,
128, // MPerBlock
128, // NPerBlock
64, // KPerBlock
128, // Gemm1NPerBlock
64, // Gemm1KPerBlock
16, // AK1
16, // BK1
4, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
4, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
16,
16,
true,
S<4, 64, 1>, // BBlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
16,
16,
true,
S<4, 64, 1>, // B1BlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
4,
4,
true,
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // CShuffleBlockTransferScalarPerVector_NPerBlock
#define BUILD_INT4_EXAMPLE
#include "run_grouped_conv_conv_fwd_example.inc"
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
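// Note: in this int4 example the kernel-side data types are int8_t (KernelIn0DataType etc.),
// so the host<->device copies rely on ck::int4_t and int8_t having identical storage size.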
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
#endif
int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using In0DataType = int8_t;
using Wei0DataType = int8_t;
using Acc0DataType = int32_t;
using Wei1DataType = int8_t;
using Acc1DataType = int32_t;
using C1ShuffleDataType = int32_t;
using Out1DataType = int8_t;
// This is used for reference code
using Out0DataType = int8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using In0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceBatchedGemmGemmInstance =
ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
Row, // ALayout
Col, // B0Layout
Col, // B1Layout
Row, // CLayout
In0DataType, // ADataType,
Wei0DataType, // B0DataType,
Wei1DataType, // B1DataType,
Out1DataType, // CDataType,
Acc0DataType, // AccDataType,
C1ShuffleDataType, // CShuffleDataType,
In0ElementOp, // AElementOp,
Wei0ElementOp, // B0ElementOp,
Out0ElementOp, // Acc0ElementOp,
Wei1ElementOp, // B1ElementOp,
Out1ElementOp, // CElementOp,
GemmDefault,
1,
256,
128, // MPerBlock
128, // NPerBlock
64, // KPerBlock
128, // Gemm1NPerBlock
64, // Gemm1KPerBlock
16, // AK1
16, // BK1
4, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
4, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
16,
16,
true,
S<4, 64, 1>, // BBlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
16,
16,
true,
S<4, 64, 1>, // B1BlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
4,
4,
true,
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // CShuffleBlockTransferScalarPerVector_NPerBlock
#include "run_grouped_conv_conv_fwd_example.inc"
int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
template <ck::index_t NDimSpatial,
typename In0DataType,
typename Wei0DataType,
typename Out0DataType,
typename Wei1DataType,
typename Out1DataType,
typename In0ElementOp,
typename Wei0ElementOp,
typename Out0ElementOp,
typename Wei1ElementOp,
typename Out1ElementOp,
typename DeviceOpInstance>
bool run_grouped_conv_conv_fwd(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv0_param,
const ck::utils::conv::ConvParam& conv1_param,
const HostTensorDescriptor& in0_g_n_c_wis_desc,
const HostTensorDescriptor& wei0_g_k_c_xs_desc,
const HostTensorDescriptor& out0_g_n_k_wos_desc,
const HostTensorDescriptor& wei1_g_k_c_xs_desc,
const HostTensorDescriptor& out1_g_n_k_wos_desc,
const In0ElementOp& in0_element_op,
const Wei0ElementOp& wei0_element_op,
const Wei1ElementOp& wei1_element_op,
const Out0ElementOp& out0_element_op,
const Out1ElementOp& out1_element_op)
{
Tensor<In0DataType> in0(in0_g_n_c_wis_desc);
Tensor<Wei0DataType> wei0(wei0_g_k_c_xs_desc);
Tensor<Wei1DataType> wei1(wei1_g_k_c_xs_desc);
Tensor<Out1DataType> out1_host(out1_g_n_k_wos_desc);
Tensor<Out1DataType> out1_device(out1_g_n_k_wos_desc);
std::cout << "in0: " << in0.mDesc << std::endl;
std::cout << "wei0: " << wei0.mDesc << std::endl;
std::cout << "wei1: " << wei1.mDesc << std::endl;
std::cout << "out1: " << out1_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in0.GenerateTensorValue(GeneratorTensor_2<In0DataType>{-5, 5});
wei0.GenerateTensorValue(GeneratorTensor_2<Wei0DataType>{-5, 5});
wei1.GenerateTensorValue(GeneratorTensor_2<Wei1DataType>{-5, 5});
break;
default:
in0.GenerateTensorValue(GeneratorTensor_3<In0DataType>{0.0, 1.0});
wei0.GenerateTensorValue(GeneratorTensor_3<Wei0DataType>{-0.5, 0.5});
wei1.GenerateTensorValue(GeneratorTensor_3<Wei1DataType>{-0.5, 0.5});
}
#ifdef BUILD_INT4_EXAMPLE
DeviceMem in0_device_buf(sizeof(KernelIn0DataType) * in0.mDesc.GetElementSpaceSize());
DeviceMem wei0_device_buf(sizeof(KernelWei0DataType) * wei0.mDesc.GetElementSpaceSize());
DeviceMem wei1_device_buf(sizeof(KernelWei1DataType) * wei1.mDesc.GetElementSpaceSize());
DeviceMem out1_device_buf(sizeof(KernelOut1DataType) * out1_device.mDesc.GetElementSpaceSize());
const Tensor<KernelIn0DataType> in0_converted(in0);
const Tensor<KernelWei0DataType> wei0_converted(wei0);
const Tensor<KernelWei1DataType> wei1_converted(wei1);
in0_device_buf.ToDevice(in0_converted.mData.data());
wei0_device_buf.ToDevice(wei0_converted.mData.data());
wei1_device_buf.ToDevice(wei1_converted.mData.data());
#else
DeviceMem in0_device_buf(sizeof(In0DataType) * in0.mDesc.GetElementSpaceSize());
DeviceMem wei0_device_buf(sizeof(Wei0DataType) * wei0.mDesc.GetElementSpaceSize());
DeviceMem wei1_device_buf(sizeof(Wei1DataType) * wei1.mDesc.GetElementSpaceSize());
DeviceMem out1_device_buf(sizeof(Out1DataType) * out1_device.mDesc.GetElementSpaceSize());
in0_device_buf.ToDevice(in0.mData.data());
wei0_device_buf.ToDevice(wei0.mData.data());
wei1_device_buf.ToDevice(wei1.mData.data());
#endif
std::array<ck::index_t, NDimSpatial + 3> a0_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a0_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b0_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b0_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> b1_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b1_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e1_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e1_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv0_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv0_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input0_left_pads{};
std::array<ck::index_t, NDimSpatial> input0_right_pads{};
std::array<ck::index_t, NDimSpatial> conv1_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv1_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input1_left_pads{};
std::array<ck::index_t, NDimSpatial> input1_right_pads{};
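    // Pack the host tensor descriptor lengths/strides and the conv parameters into the
    // fixed-size std::array arguments expected by the device op.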
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(in0_g_n_c_wis_desc.GetLengths(), a0_g_n_c_wis_lengths);
copy(in0_g_n_c_wis_desc.GetStrides(), a0_g_n_c_wis_strides);
copy(wei0_g_k_c_xs_desc.GetLengths(), b0_g_k_c_xs_lengths);
copy(wei0_g_k_c_xs_desc.GetStrides(), b0_g_k_c_xs_strides);
copy(wei1_g_k_c_xs_desc.GetLengths(), b1_g_k_c_xs_lengths);
copy(wei1_g_k_c_xs_desc.GetStrides(), b1_g_k_c_xs_strides);
copy(out1_g_n_k_wos_desc.GetLengths(), e1_g_n_k_wos_lengths);
copy(out1_g_n_k_wos_desc.GetStrides(), e1_g_n_k_wos_strides);
copy(conv0_param.conv_filter_strides_, conv0_filter_strides);
copy(conv0_param.conv_filter_dilations_, conv0_filter_dilations);
copy(conv0_param.input_left_pads_, input0_left_pads);
copy(conv0_param.input_right_pads_, input0_right_pads);
copy(conv1_param.conv_filter_strides_, conv1_filter_strides);
copy(conv1_param.conv_filter_dilations_, conv1_filter_dilations);
copy(conv1_param.input_left_pads_, input1_left_pads);
copy(conv1_param.input_right_pads_, input1_right_pads);
    // Do the convolutions as GEMMs; this only works for 1x1 convolutions for now.
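    // Mapping used below (per group, batched over G):
    //   gemm0: M = N * (output spatial size), N = K0 (conv0 output channels),
    //          K = C0 * (filter spatial size), which reduces to C0 for 1x1 filters;
    //   gemm1: reuses gemm0's output as its A matrix, with N = K1 (conv1 output channels).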
const ck::index_t gemm_batch = a0_g_n_c_wis_lengths[0];
const ck::index_t gemm0_m_length =
e1_g_n_k_wos_lengths[1] * std::accumulate(e1_g_n_k_wos_lengths.begin() + 3,
e1_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
ck::index_t{1},
std::multiplies<ck::index_t>{});
const ck::index_t gemm0_n_length = b0_g_k_c_xs_lengths[1];
const ck::index_t gemm0_k_length =
std::accumulate(b0_g_k_c_xs_lengths.begin() + 2,
b0_g_k_c_xs_lengths.begin() + 2 + NDimSpatial + 1,
ck::index_t{1},
std::multiplies<ck::index_t>{});
const ck::index_t gemm1_n_length = b1_g_k_c_xs_lengths[1];
//
const ck::index_t a0_stride = a0_g_n_c_wis_strides[2 + NDimSpatial];
const ck::index_t b0_stride = b0_g_k_c_xs_strides[2 + NDimSpatial];
const ck::index_t b1_stride = b1_g_k_c_xs_strides[2 + NDimSpatial];
const ck::index_t e1_stride = e1_g_n_k_wos_strides[2 + NDimSpatial];
//
const ck::index_t a0_batch_stride = a0_g_n_c_wis_strides[0];
const ck::index_t b0_batch_stride = b0_g_k_c_xs_strides[0];
const ck::index_t b1_batch_stride = b1_g_k_c_xs_strides[0];
const ck::index_t e1_batch_stride = e1_g_n_k_wos_strides[0];
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument = device_op.MakeArgument(
#ifdef BUILD_INT4_EXAMPLE
static_cast<KernelIn0DataType*>(in0_device_buf.GetDeviceBuffer()),
static_cast<KernelWei0DataType*>(wei0_device_buf.GetDeviceBuffer()),
static_cast<KernelWei1DataType*>(wei1_device_buf.GetDeviceBuffer()),
static_cast<KernelOut1DataType*>(out1_device_buf.GetDeviceBuffer()),
#else
static_cast<In0DataType*>(in0_device_buf.GetDeviceBuffer()),
static_cast<Wei0DataType*>(wei0_device_buf.GetDeviceBuffer()),
static_cast<Wei1DataType*>(wei1_device_buf.GetDeviceBuffer()),
static_cast<Out1DataType*>(out1_device_buf.GetDeviceBuffer()),
#endif
gemm0_m_length,
gemm0_n_length,
gemm0_k_length,
gemm1_n_length,
gemm_batch,
a0_stride,
b0_stride,
b1_stride,
e1_stride,
a0_batch_stride,
b0_batch_stride,
b1_batch_stride,
e1_batch_stride,
in0_element_op,
wei0_element_op,
out0_element_op,
wei1_element_op,
out1_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = conv0_param.GetFlops() + conv1_param.GetFlops();
std::size_t num_btype = conv0_param.template GetInputByte<In0DataType>() +
conv0_param.template GetWeightByte<Wei0DataType>() +
conv1_param.template GetWeightByte<Wei1DataType>() +
conv1_param.template GetOutputByte<Out1DataType>();
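    // avg_time is reported in milliseconds, so flop / (1.E9 * ms) gives TFlops and
    // bytes / (1.E6 * ms) gives GB/s.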
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< device_op.GetTypeString() << std::endl;
if(do_verification)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
Tensor<Out0DataType> out0_host(out0_g_n_k_wos_desc);
auto ref_conv0 = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
In0DataType,
Wei0DataType,
Out0DataType,
In0ElementOp,
Wei0ElementOp,
Out0ElementOp>();
auto ref_conv1 = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
Out0DataType,
Wei1DataType,
Out1DataType,
PassThrough,
Wei1ElementOp,
Out1ElementOp>();
auto ref_conv0_invoker = ref_conv0.MakeInvoker();
auto ref_conv1_invoker = ref_conv1.MakeInvoker();
auto ref_conv0_argument = ref_conv0.MakeArgument(in0,
wei0,
out0_host,
conv0_param.conv_filter_strides_,
conv0_param.conv_filter_dilations_,
conv0_param.input_left_pads_,
conv0_param.input_right_pads_,
in0_element_op,
wei0_element_op,
out0_element_op);
auto ref_conv1_argument = ref_conv1.MakeArgument(out0_host,
wei1,
out1_host,
conv1_param.conv_filter_strides_,
conv1_param.conv_filter_dilations_,
conv1_param.input_left_pads_,
conv1_param.input_right_pads_,
out0_element_op,
wei1_element_op,
out1_element_op);
ref_conv0_invoker.Run(ref_conv0_argument);
ref_conv1_invoker.Run(ref_conv1_argument);
#ifdef BUILD_INT4_EXAMPLE
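        // Read back the kernel's int8-typed output and convert it to ck::int4_t for host comparison.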
Tensor<KernelOut1DataType> out1_device_converted(out1_host.mDesc);
out1_device_buf.FromDevice(out1_device_converted.mData.data());
out1_device = out1_device_converted.CopyAsType<Out1DataType>();
#else
out1_device_buf.FromDevice(out1_device.mData.data());
#endif
return ck::utils::check_err(
out1_device.mData, out1_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
}
return true;
}
bool run_grouped_conv_conv_fwd_example(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
ck::utils::conv::ConvParam conv0_param{
2, 1, 128, 512, 128, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
ck::utils::conv::ConvParam conv1_param{
2, 1, 128, 128, 512, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
exit(0);
}
const auto in0_element_op = In0ElementOp{};
const auto wei0_element_op = Wei0ElementOp{};
const auto wei1_element_op = Wei1ElementOp{};
const auto out0_element_op = Out0ElementOp{};
const auto out1_element_op = Out1ElementOp{};
const auto run = [&](auto ndim_spatial,
auto in0_layout,
auto wei0_layout,
auto wei1_layout,
auto out1_layout) {
constexpr ck::index_t ndim_spatial_value = ndim_spatial.value;
using In0Layout = decltype(in0_layout);
using Wei0Layout = decltype(wei0_layout);
using Wei1Layout = decltype(wei1_layout);
using Out1Layout = decltype(out1_layout);
const auto in0_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<In0Layout>(
conv0_param);
const auto wei0_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<Wei0Layout>(
conv0_param);
        // out0 does not physically exist on the device, so any layout works for host verification
const auto out0_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<Out1Layout>(
conv0_param);
const auto wei1_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<Wei1Layout>(
conv1_param);
const auto out1_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<Out1Layout>(
conv1_param);
return run_grouped_conv_conv_fwd<ndim_spatial_value,
In0DataType,
Wei0DataType,
Out0DataType,
Wei1DataType,
Out1DataType,
In0ElementOp,
Wei0ElementOp,
Out0ElementOp,
Wei1ElementOp,
Out1ElementOp,
DeviceBatchedGemmGemmInstance>(do_verification,
init_method,
time_kernel,
conv0_param,
conv1_param,
in0_g_n_c_wis_desc,
wei0_g_k_c_xs_desc,
out0_g_n_k_wos_desc,
wei1_g_k_c_xs_desc,
out1_g_n_k_wos_desc,
in0_element_op,
wei0_element_op,
wei1_element_op,
out0_element_op,
out1_element_op);
};
namespace ctc = ck::tensor_layout::convolution;
if(conv0_param.num_dim_spatial_ == 1)
{
return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GKXC{}, ctc::GNWK{});
}
else if(conv0_param.num_dim_spatial_ == 2)
{
return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GKYXC{}, ctc::GNHWK{});
}
else if(conv0_param.num_dim_spatial_ == 3)
{
return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GKZYXC{}, ctc::GNDHWK{});
}
return true;
}
@@ -26,6 +26,7 @@ add_subdirectory(02_gemm_bilinear)
add_subdirectory(03_gemm_bias_relu)
add_subdirectory(04_gemm_add_add_fastgelu)
add_subdirectory(09_convnd_fwd)
add_subdirectory(10_convnd_fwd_multiple_d_multiple_reduce)
add_subdirectory(12_reduce)
add_subdirectory(13_pool2d_fwd)
add_subdirectory(14_gemm_xdl_requant_relu_requant)
@@ -38,7 +39,7 @@ add_subdirectory(20_convnd_bwd_weight)
add_subdirectory(21_gemm_layernorm)
add_subdirectory(22_cgemm)
add_subdirectory(23_softmax)
add_subdirectory(24_batched_gemm_e_permute)
add_subdirectory(24_batched_gemm)
add_subdirectory(25_gemm_bias_e_permute)
add_subdirectory(26_contraction)
add_subdirectory(27_layernorm)
@@ -46,4 +47,10 @@ add_subdirectory(28_grouped_gemm_bias_e_permute)
add_subdirectory(29_batched_gemm_bias_e_permute)
add_subdirectory(30_grouped_convnd_fwd_bias_relu_add)
add_subdirectory(31_batched_gemm_gemm)
add_subdirectory(32_batched_gemm_softmax_gemm)
add_subdirectory(32_batched_gemm_scale_softmax_gemm)
add_subdirectory(33_multiple_reduce)
add_subdirectory(34_batchnorm)
add_subdirectory(35_splitK_gemm)
add_subdirectory(36_sparse_embedding)
add_subdirectory(37_batched_gemm_add_add_relu_gemm_add)
add_subdirectory(41_grouped_conv_conv_fwd)
@@ -149,6 +149,17 @@
// workaround: compiler generating inefficient ds_write instructions
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
// (gfx908 only) workaround: compiler crash in fused kernels on mainline #9110; #10738 seems ok
// error message was "fatal error: error in backend: Error while trying to spill VGPR0 from class
// VGPR_32: Cannot scavenge register without an emergency spill slot!"
// this falls back to a less ideal way of handling NPadding in the fused attention kernel
#ifdef __gfx908__
#define CK_WORKAROUND_SWDEV_XXXXXX_ATTN_KERNEL_CLANG_CANNOT_SCAVENGE_REGISTER 1
#else
// for __gfx90a__, ...
#define CK_WORKAROUND_SWDEV_XXXXXX_ATTN_KERNEL_CLANG_CANNOT_SCAVENGE_REGISTER 0
#endif // __gfx908__
// workaround: verification failure, due to a compiler regression, for conv bwd-data fp16 with some
// tuning parameters
#define CK_WORKAROUND_SWDEV_325164 0