Merge branch 'develop' into wavelet_model

b89a88b5 · Adam Osewski · 41d5fca7 · 43c898f6 · b89a88b5 · b89a88b5
Commit b89a88b5 authored Sep 19, 2022 by Adam Osewski
20 changed files
--- a/example/23_softmax/softmax_blockwise.cpp
+++ b/example/23_softmax/softmax_blockwise.cpp
@@ -9,37 +9,41 @@

 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_common_util.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"

-using namespace ck;
 using namespace ck::tensor_operation::device;

 using InDataType  = ck::half_t;
 using OutDataType = ck::half_t;
 using AccDataType = float;

+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
 constexpr int Rank         = 3;
 constexpr int NumReduceDim = 1;

-using DeviceInstance = DeviceSoftmax<InDataType,
-                                     AccDataType,
-                                     OutDataType,
-                                     Rank,
-                                     NumReduceDim,
-                                     256, // BlockSize
-                                     8,   // ClusterM
-                                     32,  // ClusterK
-                                     1,   // SliceM
-                                     8,   // SliceK
-                                     1,   // SrcVecDim (0=M, 1=K)
-                                     8,   // SrcScalarPerVector
-                                     8>;  // OutScalarPerVector
+using DeviceInstance = DeviceSoftmaxImpl<InDataType,
+                                         AccDataType,
+                                         OutDataType,
+                                         PassThrough, // InElementwiseOperation
+                                         PassThrough, // AccElementwiseOperation
+                                         Rank,
+                                         NumReduceDim,
+                                         256, // BlockSize
+                                         8,   // ClusterM
+                                         32,  // ClusterK
+                                         1,   // SliceM
+                                         8,   // SliceK
+                                         1,   // SrcVecDim (0=M, 1=K)
+                                         8,   // SrcScalarPerVector
+                                         8>;  // OutScalarPerVector

 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"verify", required_argument, nullptr, 'v'},
@@ -196,7 +200,7 @@ int main(int argc, char* argv[])
    if(args.do_verification)
    {
        using ReferenceInstance =
-            tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
+            ck::tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
        ReferenceInstance ref;
        auto ref_arg = ref.MakeArgument(in, out_ref, alpha, beta, reduceDims);
        auto invoker = ref.MakeInvoker();
@@ -220,7 +224,9 @@ int main(int argc, char* argv[])
                                                            &alpha,
                                                            &beta,
                                                            in_dev.GetDeviceBuffer(),
-                                                            out_dev.GetDeviceBuffer());
+                                                            out_dev.GetDeviceBuffer(),
+                                                            PassThrough{},
+                                                            PassThrough{});

    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
    {

--- a/example/24_batched_gemm/CMakeLists.txt
+++ b/example/24_batched_gemm/CMakeLists.txt
+add_custom_target(example_batched_gemm_xdl)
+
+add_example_executable(example_batched_gemm_xdl_fp32 batched_gemm_xdl_fp32.cpp)
+add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp)
+add_example_executable(example_batched_gemm_xdl_bfp16 batched_gemm_xdl_bfp16.cpp)
+add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp)
+
+add_dependencies(example_batched_gemm_xdl
+                 example_batched_gemm_xdl_fp32
+                 example_batched_gemm_xdl_fp16
+                 example_batched_gemm_xdl_bfp16
+                 example_batched_gemm_xdl_int8)
+
+if(USE_BITINT_EXTENSION_INT4)
+  add_example_executable(example_batched_gemm_xdl_int4 batched_gemm_xdl_int4.cpp)
+  add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4)
+endif()
--- a/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = BF16;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = BF16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+// clang-format on
+
+#include "run_batched_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
--- a/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+// clang-format on
+
+#include "run_batched_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
--- a/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F32;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+// clang-format on
+
+#include "run_batched_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
--- a/example/24_batched_gemm/batched_gemm_xdl_int4.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_int4.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = ck::int4_t;
+using BDataType        = ck::int4_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int32_t;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = ck::int4_t;
+
+using KernelADataType = int8_t;
+using KernelBDataType = int8_t;
+using KernelEDataType = int8_t;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
+    // clang-format off
+        < ALayout,              //ALayout
+          BLayout,              //BLayout
+          DsLayout,             //DsLayout
+          ELayout,              //ELayout
+          KernelADataType,      //ADataType    
+          KernelBDataType,      //BDataType   
+          AccDataType,          //AccDataType
+          CShuffleDataType,     //CShuffleDataType
+          DsDataType,           //DsDataType
+          KernelEDataType,      //EDataType
+          AElementOp,           //AElementwiseOperation
+          BElementOp,           //BElementwiseOperation
+          CDEElementOp,         //CDEElementwiseOperation
+          GemmDefault,          //GEMMSpecialization
+          1,                    // NumGemmKPrefetchStage
+          256,                  // BlockSize
+          256,                  // MPerBlock
+          128,                  // NPerBlock
+          64,                   // KPerBlock
+          16,                   // AK1
+          16,                   // BK1
+          32,                   // MPerXdl
+          32,                   // NPerXdl
+          4,                    // MXdlPerWave
+          2,                    // NXdlPerWave
+          S<4, 64, 1>,          // ABlockTransfer ThreadCluster Lengths_K0_M_K1
+          S<1, 0, 2>,           // ABlockTransfer ThreadCluster ArrangeOrder
+          S<1, 0, 2>,           // ABlockTransfer SrcAccessOrder
+          2,                    // ABlockTransfer SrcVectorDim
+          16,                   // ABlockTransfer SrcScalarPerVector
+          16,                   // ABlockTransfer DstScalarPerVector_K1
+          1,                    // ABlockLdsExtraM
+          S<4, 64, 1>,          // BBlockTransfer ThreadCluster Lengths_K0_N_K1
+          S<1, 0, 2>,           // BBlockTransfer ThreadCluster ArrangeOrder
+          S<1, 0, 2>,           // BBlockTransfer SrcAccessOrder
+          2,                    // BBlockTransfer SrcVectorDim
+          16,                   // BBlockTransfer SrcScalarPerVector
+          16,                   // BBlockTransfer DstScalarPerVector_K1
+          1,                    // BBlockLdsExtraN
+          1,                    // CShuffleMXdlPerWavePerShuffle
+          1,                    // CShuffleNXdlPerWavePerShuffle
+          S<1, 64, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MWaveMPerXdl_NBlock_NWaveNPerXdl
+          16>;                  // CBlockTransferScalarPerVector_NWaveNPerXdl
+// clang-format on
+
+#define BUILD_INT4_EXAMPLE
+#include "run_batched_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
--- a/example/24_batched_gemm/batched_gemm_xdl_int8.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_int8.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = int8_t;
+using BDataType        = int8_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int8_t;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = int8_t;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+// clang-format on
+
+#include "run_batched_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
--- a/example/24_batched_gemm/run_batched_gemm_example.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example.inc
+#include <random>
+
+#pragma once
+
+struct ProblemSize final
+{
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t stride_A = K;
+    ck::index_t stride_B = K;
+    ck::index_t stride_C = N;
+
+    ck::index_t batch_stride_A = M * K;
+    ck::index_t batch_stride_B = K * N;
+    ck::index_t batch_stride_C = M * N;
+
+    ck::index_t batch_count = 16;
+};
+
+struct ExecutionConfig final
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+};
+
+bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
+    static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
+    static_assert(sizeof(ADataType) == sizeof(KernelADataType));
+    static_assert(sizeof(BDataType) == sizeof(KernelBDataType));
+    static_assert(sizeof(EDataType) == sizeof(KernelEDataType));
+#endif
+
+    auto& [M,
+           N,
+           K,
+           stride_A,
+           stride_B,
+           stride_C,
+           batch_stride_A,
+           batch_stride_B,
+           batch_stride_C,
+           batch_count] = problem_size;
+
+    // GEMM shape
+    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
+                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+        }
+        else
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
+                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+        }
+    };
+
+    Tensor<ADataType> a_g_m_k(
+        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
+    Tensor<BDataType> b_g_k_n(
+        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
+#ifdef BUILD_INT4_EXAMPLE
+    Tensor<KernelEDataType> e_g_m_n_device_result(
+        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
+#else
+    Tensor<EDataType> e_g_m_n_device_result(
+        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
+#endif
+
+    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
+    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
+    std::cout << "e_g_m_n: " << e_g_m_n_device_result.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());
+
+#ifdef BUILD_INT4_EXAMPLE
+    const Tensor<KernelADataType> a_g_m_k_converted(a_g_m_k);
+    const Tensor<KernelBDataType> b_g_k_n_converted(b_g_k_n);
+
+    a_device_buf.ToDevice(a_g_m_k_converted.mData.data());
+    b_device_buf.ToDevice(b_g_k_n_converted.mData.data());
+#else
+    a_device_buf.ToDevice(a_g_m_k.mData.data());
+    b_device_buf.ToDevice(b_g_k_n.mData.data());
+#endif
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+
+    // do GEMM
+    auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                      b_device_buf.GetDeviceBuffer(),
+                                      {},
+                                      c_device_buf.GetDeviceBuffer(),
+                                      M,
+                                      N,
+                                      K,
+                                      batch_count,
+                                      stride_A,
+                                      stride_B,
+                                      {},
+                                      stride_C,
+                                      batch_stride_A,
+                                      batch_stride_B,
+                                      {},
+                                      batch_stride_C,
+                                      a_element_op,
+                                      b_element_op,
+                                      cde_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    invoker.Run(argument, StreamConfig{nullptr, false});
+    bool pass = true;
+
+    if(config.do_verification)
+    {
+        c_device_buf.FromDevice(e_g_m_n_device_result.mData.data());
+
+        using ReferenceBatchedGemmInstance =
+            ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                             BDataType,
+                                                             EDataType,
+                                                             AccDataType,
+                                                             AElementOp,
+                                                             BElementOp,
+                                                             CDEElementOp>;
+
+        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
+        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
+
+        Tensor<EDataType> e_g_m_n_host_result(
+            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
+
+        auto ref_argument = ref_batched_gemm.MakeArgument(
+            a_g_m_k, b_g_k_n, e_g_m_n_host_result, a_element_op, b_element_op, cde_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+#ifdef BUILD_INT4_EXAMPLE
+        const Tensor<EDataType> e_device_result_converted(e_g_m_n_device_result);
+        pass &= ck::utils::check_err(e_device_result_converted.mData, e_g_m_n_host_result.mData);
+
+#else
+        pass = ck::utils::check_err(
+            e_g_m_n_device_result.mData, e_g_m_n_host_result.mData, "Error: Incorrect results c");
+#endif
+    }
+
+    if(config.time_kernel)
+    {
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+        std::size_t flop      = std::size_t(2) * batch_count * M * N * K;
+        std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
+                                sizeof(BDataType) * batch_count * K * N +
+                                sizeof(EDataType) * batch_count * M * N;
+
+        float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+
+    return pass ? 0 : 1;
+}
+
+bool run_batched_gemm_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    std::mt19937 gen(11939);
+    std::uniform_int_distribution<int> dis(0, 15);
+
+    problem_size.M = 256 * (dis(gen) + 1);
+    problem_size.N = 128 * (dis(gen) + 1);
+    problem_size.K = 64 * (dis(gen) + 2);
+
+    problem_size.stride_A = problem_size.K;
+    problem_size.stride_B = problem_size.K;
+    problem_size.stride_C = problem_size.N;
+
+    problem_size.batch_stride_A = problem_size.M * problem_size.K;
+    problem_size.batch_stride_B = problem_size.K * problem_size.N;
+    problem_size.batch_stride_C = problem_size.M * problem_size.N;
+
+    problem_size.batch_count = 16;
+
+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        exit(0);
+    }
+
+    return run_batched_gemm(problem_size, config);
+}
--- a/example/24_batched_gemm_e_permute/CMakeLists.txt
+++ b/example/24_batched_gemm_e_permute/CMakeLists.txt
-add_example_executable(example_batched_gemm_e_permute_xdl_fp16 batched_gemm_e_permute_xdl_fp16.cpp)
-
--- a/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt
 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
-target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 PRIVATE utility)

 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
-target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 PRIVATE utility)

 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
-target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 PRIVATE utility)

 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
-target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 PRIVATE utility)
\ No newline at end of file
+
+if(USE_BITINT_EXTENSION_INT4)
+  add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int4 grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp)
+endif() # USE_BITINT_EXTENSION_INT4
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp
@@ -26,13 +26,16 @@ void print_helper_msg()
 }

 template <ck::index_t NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
+          typename InKernelDataType,
+          typename WeiKernelDataType,
          typename CShuffleDataType,
-          typename OutDataType,
+          typename OutKernelDataType,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
+          typename InUserDataType,
+          typename WeiUserDataType,
+          typename OutUserDataType,
          typename DeviceConvNDFwdInstance>
 int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
                                       int init_method,
@@ -47,12 +50,12 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
                                       const WeiElementOp& wei_element_op,
                                       const OutElementOp& out_element_op)
 {
-    Tensor<InDataType> in(in_g_n_c_wis_desc);
-    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
-    Tensor<OutDataType> bias(bias_g_n_k_wos_desc);
-    Tensor<OutDataType> residual(residual_g_n_k_wos_desc);
-    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
-    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+    Tensor<InUserDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutUserDataType> bias(bias_g_n_k_wos_desc);
+    Tensor<OutUserDataType> residual(residual_g_n_k_wos_desc);
+    Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);

    std::cout << "in: " << in.mDesc << std::endl;
    std::cout << "wei: " << wei.mDesc << std::endl;
@@ -64,26 +67,38 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    {
    case 0: break;
    case 1:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        in.GenerateTensorValue(GeneratorTensor_2<InUserDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiUserDataType>{-5, 5});
+        bias.GenerateTensorValue(GeneratorTensor_2<OutUserDataType>{-5, 5});
        break;
    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-        bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        in.GenerateTensorValue(GeneratorTensor_3<InUserDataType>{0.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiUserDataType>{-0.5, 0.5});
+        bias.GenerateTensorValue(GeneratorTensor_3<OutUserDataType>{-0.5, 0.5});
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize());
-    DeviceMem residual_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
-
+    DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem bias_device_buf(sizeof(OutKernelDataType) * bias.mDesc.GetElementSpaceSize());
+    DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());
+
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    const Tensor<InKernelDataType> in_converted(in);
+    const Tensor<WeiKernelDataType> wei_converted(wei);
+    const Tensor<OutKernelDataType> bias_converted(bias);
+    const Tensor<OutKernelDataType> residual_converted(residual);
+
+    in_device_buf.ToDevice(in_converted.mData.data());
+    wei_device_buf.ToDevice(wei_converted.mData.data());
+    bias_device_buf.ToDevice(bias_converted.mData.data());
+    residual_device_buf.ToDevice(residual_converted.mData.data());
+#else  // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    bias_device_buf.ToDevice(bias.mData.data());
    residual_device_buf.ToDevice(residual.mData.data());
+#endif //  CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4

    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
@@ -154,7 +169,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop      = conv_param.GetFlops();
-    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+    std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();

    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;
@@ -168,8 +183,8 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
        Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);

        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
-                                                                     InDataType,
-                                                                     WeiDataType,
+                                                                     InUserDataType,
+                                                                     WeiUserDataType,
                                                                     CShuffleDataType,
                                                                     InElementOp,
                                                                     WeiElementOp,
@@ -196,10 +211,22 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,

        out_device_buf.FromDevice(out_device.mData.data());

+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+        const Tensor<OutUserDataType> out_device_converted(out_device);
+
+        return ck::utils::check_err(out_device_converted.mData,
+                                    out_host.mData,
+                                    "Error: incorrect results!",
+                                    1e-5f,
+                                    1e-4f)
+                   ? 0
+                   : 1;
+#else  // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
        return ck::utils::check_err(
                   out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
                   ? 0
                   : 1;
+#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
    }

    return 0;

--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp
@@ -7,13 +7,19 @@

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"

-using InDataType       = ck::bhalf_t;
-using WeiDataType      = ck::bhalf_t;
-using AccDataType      = float;
-using CShuffleDataType = float;
-using BiasDataType     = ck::bhalf_t;
-using ResidualDataType = ck::bhalf_t;
-using OutDataType      = ck::bhalf_t;
+// kernel data types
+using InKernelDataType       = ck::bhalf_t;
+using WeiKernelDataType      = ck::bhalf_t;
+using AccDataType            = float;
+using CShuffleDataType       = float;
+using BiasKernelDataType     = ck::bhalf_t;
+using ResidualKernelDataType = ck::bhalf_t;
+using OutKernelDataType      = ck::bhalf_t;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
        WeiLayout,
        ck::Tuple<BiasLayout, ResidualLayout>,
        OutLayout,
-        InDataType,
-        WeiDataType,
+        InKernelDataType,
+        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
-        ck::Tuple<BiasDataType, ResidualDataType>,
-        OutDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<1,
                                                                                 InLayout,
                                                                                 WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<2,
                                                                                 InLayout,
                                                                                 WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<3,
                                                                                 InLayout,
                                                                                 WeiLayout,

--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp
@@ -7,13 +7,19 @@

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"

-using InDataType       = ck::half_t;
-using WeiDataType      = ck::half_t;
-using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
-using BiasDataType     = ck::half_t;
-using ResidualDataType = ck::half_t;
-using OutDataType      = ck::half_t;
+// kernel data types
+using InKernelDataType       = ck::half_t;
+using WeiKernelDataType      = ck::half_t;
+using AccDataType            = float;
+using CShuffleDataType       = ck::half_t;
+using BiasKernelDataType     = ck::half_t;
+using ResidualKernelDataType = ck::half_t;
+using OutKernelDataType      = ck::half_t;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
        WeiLayout,
        ck::Tuple<BiasLayout, ResidualLayout>,
        OutLayout,
-        InDataType,
-        WeiDataType,
+        InKernelDataType,
+        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
-        ck::Tuple<BiasDataType, ResidualDataType>,
-        OutDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
@@ -157,9 +163,9 @@ int main(int argc, char* argv[])
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
-                0,             // k
-                1,             // c
-                0              // x
+                0,             // n
+                1,             // k
+                0              // wo
            });

        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<1,
                                                                                 InLayout,
                                                                                 WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<2,
                                                                                 InLayout,
                                                                                 WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<3,
                                                                                 InLayout,
                                                                                 WeiLayout,

--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
@@ -7,13 +7,19 @@

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"

-using InDataType       = float;
-using WeiDataType      = float;
-using AccDataType      = float;
-using CShuffleDataType = float;
-using BiasDataType     = float;
-using ResidualDataType = float;
-using OutDataType      = float;
+// kernel data types
+using InKernelDataType       = float;
+using WeiKernelDataType      = float;
+using AccDataType            = float;
+using CShuffleDataType       = float;
+using BiasKernelDataType     = float;
+using ResidualKernelDataType = float;
+using OutKernelDataType      = float;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
        WeiLayout,
        ck::Tuple<BiasLayout, ResidualLayout>,
        OutLayout,
-        InDataType,
-        WeiDataType,
+        InKernelDataType,
+        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
-        ck::Tuple<BiasDataType, ResidualDataType>,
-        OutDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<1,
                                                                                 InLayout,
                                                                                 WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<2,
                                                                                 InLayout,
                                                                                 WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<3,
                                                                                 InLayout,
                                                                                 WeiLayout,

--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
+
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+// kernel data types
+using InKernelDataType       = int8_t;
+using WeiKernelDataType      = int8_t;
+using AccDataType            = int32_t;
+using CShuffleDataType       = int8_t;
+using BiasKernelDataType     = int8_t;
+using ResidualKernelDataType = int8_t;
+using OutKernelDataType      = int8_t;
+
+// tensor data types
+using InUserDataType  = ck::int4_t;
+using WeiUserDataType = ck::int4_t;
+using OutUserDataType = ck::int4_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename ResidualLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout, ResidualLayout>,
+        OutLayout,
+        InKernelDataType,
+        WeiKernelDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        64,          // KPerBlock
+        16,          // AK1
+        16,          // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        16,          // ABlockTransferSrcScalarPerVector
+        16,          // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        16,          // BBlockTransferSrcScalarPerVector
+        16,          // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 64, 1, 4>,
+        16>;
+
+int main(int argc, char* argv[])
+{
+    namespace ctc = ck::tensor_layout::convolution;
+
+    print_helper_msg();
+
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // conventional group conv definition
+    // G = 2
+    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
+    // [K, C,  Y,  X] =  [512, 192,  3,  3]
+    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
+    // CK group conv definition
+    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
+    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
+    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
+    ck::utils::conv::ConvParam conv_param{
+        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        do_verification                   = std::stoi(argv[1]);
+        init_method                       = std::stoi(argv[2]);
+        time_kernel                       = std::stoi(argv[3]);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+
+        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+    }
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+
+    if(conv_param.num_dim_spatial_ == 1)
+    {
+        using InLayout       = ctc::G_NW_C;
+        using WeiLayout      = ctc::G_K_X_C;
+        using BiasLayout     = ctc::G_NW_K;
+        using ResidualLayout = ctc::G_NW_K;
+        using OutLayout      = ctc::G_NW_K;
+
+        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
+            {
+                conv_param.C_,                                                        // g
+                conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
+                1,                                                                    // c
+                conv_param.G_ * conv_param.C_                                         // wi
+            });
+
+        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
+                1,                                                                     // c
+                conv_param.C_                                                          // x
+            });
+
+        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_, // g
+                0,             // k
+                1,             // c
+                0              // x
+            });
+
+        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_, // g
+                0,             // k
+                1,             // c
+                0              // x
+            });
+
+        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_,                                                         // g
+                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
+                1,                                                                     // k
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+
+        return run_grouped_conv_fwd_bias_relu_add<1,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
+                                                  CShuffleDataType,
+                                                  OutKernelDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
+                                                  DeviceGroupedConvNDFwdInstance<1,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    }
+    else if(conv_param.num_dim_spatial_ == 2)
+    {
+        using InLayout       = ctc::G_NHW_C;
+        using WeiLayout      = ctc::G_K_YX_C;
+        using BiasLayout     = ctc::G_NHW_K;
+        using ResidualLayout = ctc::G_NHW_K;
+        using OutLayout      = ctc::G_NHW_K;
+
+        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.C_,
+             conv_param.input_spatial_lengths_[0],
+             conv_param.input_spatial_lengths_[1]},
+            {
+                conv_param.C_, // g
+                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.C_,                                    // n
+                1,                                                                    // c
+                conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
+                conv_param.G_ * conv_param.C_                                         // wi
+            });
+
+        const auto wei_g_k_c_xs_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.K_,
+                                  conv_param.C_,
+                                  conv_param.filter_spatial_lengths_[0],
+                                  conv_param.filter_spatial_lengths_[1]},
+                                 {
+                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
+                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
+                                     conv_param.filter_spatial_lengths_[0] *
+                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
+                                     1,                                                         // c
+                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
+                                     conv_param.C_                                              // x
+                                 });
+
+        const auto bias_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // ho
+                                     0              // wo
+                                 });
+
+        const auto residual_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // ho
+                                     0              // wo
+                                 });
+
+        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.K_,
+             conv_param.output_spatial_lengths_[0],
+             conv_param.output_spatial_lengths_[1]},
+            {
+                conv_param.K_, // g
+                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.K_,                                     // n
+                1,                                                                     // k
+                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+
+        return run_grouped_conv_fwd_bias_relu_add<2,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
+                                                  CShuffleDataType,
+                                                  OutKernelDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
+                                                  DeviceGroupedConvNDFwdInstance<2,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    }
+    else if(conv_param.num_dim_spatial_ == 3)
+    {
+        using InLayout       = ctc::G_NDHW_C;
+        using WeiLayout      = ctc::G_K_ZYX_C;
+        using BiasLayout     = ctc::G_NDHW_K;
+        using ResidualLayout = ctc::G_NDHW_K;
+        using OutLayout      = ctc::G_NDHW_K;
+
+        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.C_,
+             conv_param.input_spatial_lengths_[0],
+             conv_param.input_spatial_lengths_[1],
+             conv_param.input_spatial_lengths_[2]},
+            {
+                conv_param.C_, // g
+                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
+                    conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
+                1,                                                                        // c
+                conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
+                    conv_param.G_ * conv_param.C_,                                    // di
+                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
+                conv_param.G_ * conv_param.C_                                         // wi
+            });
+
+        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.K_,
+             conv_param.C_,
+             conv_param.filter_spatial_lengths_[0],
+             conv_param.filter_spatial_lengths_[1],
+             conv_param.filter_spatial_lengths_[2]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
+                    conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
+                    conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
+                    conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
+                1,                                                         // c
+                conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
+                    conv_param.C_,                                     // z
+                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
+                conv_param.C_                                          // x
+            });
+
+        const auto bias_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1],
+                                  conv_param.output_spatial_lengths_[2]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // z
+                                     0,             // y
+                                     0              // x
+                                 });
+
+        const auto residual_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1],
+                                  conv_param.output_spatial_lengths_[2]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // z
+                                     0,             // y
+                                     0              // x
+                                 });
+
+        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.K_,
+             conv_param.output_spatial_lengths_[0],
+             conv_param.output_spatial_lengths_[1],
+             conv_param.output_spatial_lengths_[2]},
+            {
+                conv_param.K_, // g
+                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
+                    conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
+                1,                                                                         // k
+                conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
+                    conv_param.G_ * conv_param.K_,                                     // do
+                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+
+        return run_grouped_conv_fwd_bias_relu_add<3,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
+                                                  CShuffleDataType,
+                                                  OutKernelDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
+                                                  DeviceGroupedConvNDFwdInstance<3,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    }
+
+    return 0;
+}
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp
@@ -7,13 +7,19 @@

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"

-using InDataType       = int8_t;
-using WeiDataType      = int8_t;
-using AccDataType      = int32_t;
-using CShuffleDataType = int8_t;
-using BiasDataType     = int8_t;
-using ResidualDataType = int8_t;
-using OutDataType      = int8_t;
+// kernel data types
+using InKernelDataType       = int8_t;
+using WeiKernelDataType      = int8_t;
+using AccDataType            = int32_t;
+using CShuffleDataType       = int8_t;
+using BiasKernelDataType     = int8_t;
+using ResidualKernelDataType = int8_t;
+using OutKernelDataType      = int8_t;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
        WeiLayout,
        ck::Tuple<BiasLayout, ResidualLayout>,
        OutLayout,
-        InDataType,
-        WeiDataType,
+        InKernelDataType,
+        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
-        ck::Tuple<BiasDataType, ResidualDataType>,
-        OutDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<1,
                                                                                 InLayout,
                                                                                 WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<2,
                                                                                 InLayout,
                                                                                 WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
            });

        return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                  CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<3,
                                                                                 InLayout,
                                                                                 WeiLayout,

--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
+add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp)
 add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
+add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp)
+add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp)
+
+if(USE_BITINT_EXTENSION_INT4)
+add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp)
+endif(USE_BITINT_EXTENSION_INT4)
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+/*
+Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
+                                              |------------|
+                                                   Gemm0
+                                              |---------------------|
+                                                       Gemm1
+*/
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = BF16;
+using B0DataType       = BF16;
+using B1DataType       = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = BF16;
+
+using ALayout  = Row;
+using B0Layout = Col;
+using B1Layout = Row;
+using CLayout  = Row;
+
+using AElementOp    = PassThrough;
+using B0ElementOp   = PassThrough;
+using Acc0ElementOp = PassThrough;
+using B1ElementOp   = PassThrough;
+using CElementOp    = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
+    ALayout,
+    B0Layout,
+    B1Layout,
+    CLayout,
+    ADataType,
+    B0DataType,
+    B1DataType,
+    CDataType,
+    AccDataType,
+    CShuffleDataType,
+    AElementOp,
+    B0ElementOp,
+    Acc0ElementOp,
+    B1ElementOp,
+    CElementOp,
+    GemmDefault,
+    1,
+    256,
+    128,         // MPerBlock
+    128,         // NPerBlock
+    32,          // KPerBlock
+    128,         // Gemm1NPerBlock
+    32,          // Gemm1KPerBlock
+    8,           // AK1
+    8,           // BK1
+    2,           // B1K1
+    32,          // MPerXDL
+    32,          // NPerXDL
+    1,           // MXdlPerWave
+    4,           // NXdlPerWave
+    4,           // Gemm1NXdlPerWave
+    S<4, 64, 1>, // ABlockTransfer
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    true,
+    S<4, 64, 1>, // BBlockTransfer
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    true,
+    S<8, 32, 1>, // B1BlockTransfer
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    4,
+    2,
+    false,
+    1,              // CShuffleMXdlPerWavePerShuffle
+    2,              // CShuffleNXdlPerWavePerShuffle
+    S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B0DataType,
+                                                                                ADataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B0ElementOp,
+                                                                                CElementOp>;
+
+using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B1DataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B1ElementOp,
+                                                                                CElementOp>;
+
+#include "run_batched_gemm_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return run_batched_gemm_gemm_example(argc, argv) ? 0 : 1; }
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
@@ -121,6 +121,7 @@ using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
                                                                                AElementOp,
                                                                                B0ElementOp,
                                                                                CElementOp>;
+
 using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
                                                                                B1DataType,
                                                                                CDataType,
@@ -129,243 +130,6 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
                                                                                B1ElementOp,
                                                                                CElementOp>;

-int main(int argc, char* argv[])
-{
-    bool do_verification = true;
-    int init_method      = 1;
-    bool time_kernel     = false;
-
-    // GEMM shape
-    ck::index_t M             = 1024;
-    ck::index_t N             = 1024;
-    ck::index_t K             = 64;
-    ck::index_t O             = 128;
-    ck::index_t BatchCount    = 4;
-    ck::index_t StrideA       = -1;
-    ck::index_t StrideB0      = -1;
-    ck::index_t StrideB1      = -1;
-    ck::index_t StrideC       = -1;
-    ck::index_t BatchStrideA  = -1;
-    ck::index_t BatchStrideB0 = -1;
-    ck::index_t BatchStrideB1 = -1;
-    ck::index_t BatchStrideC  = -1;
-
-    if(argc == 1)
-    {
-        // use default case
-    }
-    else if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 9)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-
-        M = std::stoi(argv[4]);
-        N = std::stoi(argv[5]);
-        K = std::stoi(argv[6]);
-        O = std::stoi(argv[7]);
-
-        BatchCount = std::stoi(argv[8]);
-    }
-    else if(argc == 17)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-
-        M = std::stoi(argv[4]);
-        N = std::stoi(argv[5]);
-        K = std::stoi(argv[6]);
-        O = std::stoi(argv[7]);
-
-        BatchCount = std::stoi(argv[8]);
-
-        StrideA  = std::stoi(argv[9]);
-        StrideB0 = std::stoi(argv[10]);
-        StrideB1 = std::stoi(argv[11]);
-        StrideC  = std::stoi(argv[12]);
-
-        BatchStrideA  = std::stoi(argv[13]);
-        BatchStrideB0 = std::stoi(argv[14]);
-        BatchStrideB1 = std::stoi(argv[15]);
-        BatchStrideC  = std::stoi(argv[16]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 17: M, N, K, O, Batch, StrideA, StrideB0, StrideB1, StrideC, BatchStrideA, "
-               "BatchStrideB0, BatchStrideB1, BatchStrideC\n");
-        exit(0);
-    }
-
-    const int DefaultStrideA  = ck::is_same_v<ALayout, Row> ? K : M;
-    const int DefaultStrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
-    const int DefaultStrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
-    const int DefaultStrideC  = ck::is_same_v<CLayout, Row> ? O : M;
-
-    StrideA  = (StrideA < 0) ? DefaultStrideA : StrideA;
-    StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0;
-    StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1;
-    StrideC  = (StrideC < 0) ? DefaultStrideC : StrideC;
-
-    const int DefaultBatchStrideA  = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA;
-    const int DefaultBatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
-    const int DefaultBatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
-    const int DefaultBatchStrideC  = (ck::is_same_v<CLayout, Col> ? O : M) * StrideC;
-
-    BatchStrideA  = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA;
-    BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0;
-    BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1;
-    BatchStrideC  = BatchStrideC < 0 ? DefaultBatchStrideC : BatchStrideC;
-
-    auto f_host_tensor_descriptor = [](std::size_t batch_count,
-                                       std::size_t row,
-                                       std::size_t col,
-                                       std::size_t stride,
-                                       std::size_t batch_stride,
-                                       auto layout) {
-        if(std::is_same<decltype(layout), Row>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
-        }
-        else
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
-        }
-    };
-
-    // C_m_o = A_m_k * B0_k_n * B1_n_o
-    Tensor<ADataType> a_g_m_k(
-        f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{}));
-    Tensor<B0DataType> b0_g_k_n(
-        f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{}));
-    Tensor<B1DataType> b1_g_n_o(
-        f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{}));
-    Tensor<CDataType> c_g_m_o_host_result(
-        f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
-    Tensor<CDataType> c_g_m_o_device_result(
-        f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
-
-    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
-    std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl;
-    std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
-    std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-5, 5});
-        break;
-    case 2:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
-        break;
-    default:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
-    }
-
-    DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSize());
-    DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize());
-    DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize());
-    DeviceMem c_g_m_o_device_buf(sizeof(CDataType) * c_g_m_o_device_result.mDesc.GetElementSize());
-
-    a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data());
-    b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data());
-    b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data());
-
-    auto a_element_op    = AElementOp{};
-    auto b0_element_op   = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{};
-    auto b1_element_op   = B1ElementOp{};
-    auto c_element_op    = CElementOp{};
-
-    // do GEMM
-    auto gemm    = DeviceGemmInstance{};
-    auto invoker = gemm.MakeInvoker();
-    auto argument =
-        gemm.MakeArgument(static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
-                          static_cast<B0DataType*>(b0_g_k_n_device_buf.GetDeviceBuffer()),
-                          static_cast<B1DataType*>(b1_g_n_o_device_buf.GetDeviceBuffer()),
-                          static_cast<CDataType*>(c_g_m_o_device_buf.GetDeviceBuffer()),
-                          M,
-                          N,
-                          K,
-                          O,
-                          BatchCount,
-                          StrideA,
-                          StrideB0,
-                          StrideB1,
-                          StrideC,
-                          BatchStrideA,
-                          BatchStrideB0,
-                          BatchStrideB1,
-                          BatchStrideC,
-                          a_element_op,
-                          b0_element_op,
-                          acc0_element_op,
-                          b1_element_op,
-                          c_element_op);
-
-    if(!gemm.IsSupportedArgument(argument))
-    {
-        std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
-
-        return 0;
-    }
-
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-
-    std::size_t flop      = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount;
-    std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
-                             sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
-                            BatchCount;
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << gemm.GetTypeString() << std::endl;
-
-    c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
-
-    if(do_verification)
-    {
-        // Output of Gemm0 is input A of Gemm1
-        Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
-
-        auto ref_gemm0          = ReferenceGemm0Instance{};
-        auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
-        auto ref_gemm0_argument = ref_gemm0.MakeArgument(
-            a_g_m_k, b0_g_k_n, a1_g_m_n, a_element_op, b0_element_op, PassThrough{});
-
-        ref_gemm0_invoker.Run(ref_gemm0_argument);
-
-        auto ref_gemm1          = ReferenceGemm1Instance{};
-        auto ref_gemm1_invoker  = ref_gemm1.MakeInvoker();
-        auto ref_gemm1_argument = ref_gemm1.MakeArgument(
-            a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op);
-
-        ref_gemm1_invoker.Run(ref_gemm1_argument);
-
-        return ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData) ? 0 : 1;
-    }
+#include "run_batched_gemm_gemm_example.inc"

-    return 0;
-}
+int main(int argc, char* argv[]) { return run_batched_gemm_gemm_example(argc, argv) ? 0 : 1; }
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+/*
+Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
+                                              |------------|
+                                                   Gemm0
+                                              |---------------------|
+                                                       Gemm1
+*/
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F32;
+using B0DataType       = F32;
+using B1DataType       = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F32;
+
+using ALayout  = Row;
+using B0Layout = Col;
+using B1Layout = Row;
+using CLayout  = Row;
+
+using AElementOp    = PassThrough;
+using B0ElementOp   = PassThrough;
+using Acc0ElementOp = PassThrough;
+using B1ElementOp   = PassThrough;
+using CElementOp    = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
+    ALayout,
+    B0Layout,
+    B1Layout,
+    CLayout,
+    ADataType,
+    B0DataType,
+    B1DataType,
+    CDataType,
+    AccDataType,
+    CShuffleDataType,
+    AElementOp,
+    B0ElementOp,
+    Acc0ElementOp,
+    B1ElementOp,
+    CElementOp,
+    GemmDefault,
+    1,
+    256,
+    128,         // MPerBlock
+    128,         // NPerBlock
+    16,          // KPerBlock
+    128,         // Gemm1NPerBlock
+    16,          // Gemm1KPerBlock
+    4,           // AK1
+    4,           // BK1
+    1,           // B1K1
+    32,          // MPerXDL
+    32,          // NPerXDL
+    1,           // MXdlPerWave
+    4,           // NXdlPerWave
+    4,           // Gemm1NXdlPerWave
+    S<4, 64, 1>, // ABlockTransfer
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    4,
+    4,
+    true,
+    S<4, 64, 1>, // BBlockTransfer
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    4,
+    4,
+    true,
+    S<8, 32, 1>, // B1BlockTransfer
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    4,
+    1,
+    false,
+    1,               // CShuffleMXdlPerWavePerShuffle
+    2,               // CShuffleNXdlPerWavePerShuffle
+    S<1, 16, 1, 16>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    4>;              // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B0DataType,
+                                                                                ADataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B0ElementOp,
+                                                                                CElementOp>;
+
+using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B1DataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B1ElementOp,
+                                                                                CElementOp>;
+
+#include "run_batched_gemm_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return run_batched_gemm_gemm_example(argc, argv) ? 0 : 1; }