Merge remote-tracking branch 'origin/develop' into cpu_avx2

e72c0c43 · carlushuang · d714fa15 · 313bbea5 · e72c0c43 · e72c0c43
Commit e72c0c43 authored Mar 26, 2022 by carlushuang
20 changed files
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -13,6 +13,7 @@
 #include "device_tensor.hpp"
 #include "device_gemm_xdl.hpp"
 #include "device_gemm_xdl_c_shuffle.hpp"
+#include "device_gemm_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
 #include "reference_gemm.hpp"
 #include "gemm_specialization.hpp"
@@ -44,51 +45,12 @@ using CElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
 // clang-format off
-#if 0
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
+//######| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|      Num|
+//######|        |        |        | Type|  Type|  Type| DataType| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector|
-//######|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Spacialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|
+//######|        |        |        |     |      |      |         |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock|
-//######|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|         |
+//######|        |        |        |     |      |      |         |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-//######|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |
+        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
-//    [256, 128, 4, 8], 1 stage, 2 occupancy
-        <   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1>;
-#elif 1
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
-//######|AData| BData| CData| AccData| Shuffle| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//######| Type|  Type|  Type|    Type|    Data|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//######|     |      |      |        |    Type|        |        |        |   Operation|   Operation|   Operation|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//######|     |      |      |        |        |        |        |        |            |            |            |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <  F16,   F16,   F16,     F32,     F16,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
-#elif 0
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
-//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|      Num|
-//######|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Spacialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|
-//######|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|         |
-//######|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |
-//    [128, 144, 8, 8], 1 stage, 1 occupancy, bounded by LDS size
-//     99 TFlops, 120 blocks (1024x2160x3840)
-//     99 TFlops, 960 blocks (4096x4320x3840)
-        <   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   144,     8,  8,   16,   16,    2,    9,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8,  8, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,        1>;
-//    [128, 144, 4, 8], 1 stage, 2 occupancy,
-//     92 TFlops, 120 blocks (1024x2160x3840)
-//    120 TFlops, 240 blocks (1024x4320x3840)
-//    128 TFlops, 960 blocks (4096x4320x3840)
-//      <   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   144,     4,  8,   16,   16,    2,    9,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,        1>;
-//    [ 64, 144, 8, 8], 1 stage, 2 occupancy/
-//     96 TFlops, 240 blocks (1024x2160x3840)
-//     96 TFlops, 480 blocks (1024x4320x3840)
-//     99 TFlops,1920 blocks (4096x4320x3840)
-//      <   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   144,     8,  8,   16,   16,    1,    9,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8,  8, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,        1>;
-//    [ 64, 144, 8, 8], 2 stage, 2 occupancy
-//     93 TFlops
-//      <   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   144,     8,  8,   16,   16,    1,    9,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8,  8, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,        2>;
-//    [ 64, 144, 4, 8], 1 stage, 2 occupancy
-//     87 TFlops
-//      <   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   144,     4,  8,   16,   16,    1,    9,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,        1>;
-//    [ 64, 144, 4, 8], 2 stage, 2 occupancy
-//     85 TFlops
-//      <   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   144,     4,  8,   16,   16,    1,    9,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,        2>;
-#endif
 // clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::
@@ -211,7 +173,22 @@ int main(int argc, char* argv[])
            "not support this GEMM problem");
    }
-    float ave_time = invoker.Run(argument, nrepeat);
+    // warm up
+    invoker.Run(argument);
+    // timing
+    KernelTimer timer;
+    timer.Start();
+    for(int i = 0; i < nrepeat; ++i)
+    {
+        invoker.Run(argument);
+    }
+    timer.End();
+    float ave_time = timer.GetElapsedTime() / nrepeat;
    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =

--- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
@@ -157,9 +157,9 @@ int main(int argc, char* argv[])
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<BDataType> c0_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c0_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<BDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<BDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -172,12 +172,12 @@ int main(int argc, char* argv[])
    case 1:
        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        c0_m_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        c0_m_n.GenerateTensorValue(GeneratorTensor_2<CDataType>{-5, 5});
        break;
    default:
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        c0_m_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        c0_m_n.GenerateTensorValue(GeneratorTensor_3<CDataType>{-0.5, 0.5});
    }
    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());

--- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
+++ b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
@@ -139,8 +139,8 @@ int main(int argc, char* argv[])
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<BDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<BDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    // c0_n[n]
    Tensor<CDataType> c0_n(HostTensorDescriptor(

--- a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
+++ b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
@@ -141,15 +141,15 @@ int main(int argc, char* argv[])
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<BDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<BDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    // c0_n[n]
    Tensor<CDataType> c0_n(HostTensorDescriptor(
        std::vector<std::size_t>({static_cast<std::size_t>(N)}), std::vector<std::size_t>({1})));
    // c1_m_n[m ,n]
-    Tensor<BDataType> c1_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c1_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;

--- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl.cpp
@@ -84,6 +84,9 @@ DeviceConvFwdBasePtr GetConvInstance(int num_dim_spatial)
 {
    switch(num_dim_spatial)
    {
+    case 3: {
+        return std::make_unique<DeviceConvNDFwdInstance<3>>();
+    }
    case 2: {
        return std::make_unique<DeviceConvNDFwdInstance<2>>();
    }
@@ -173,6 +176,9 @@ HostTensorDescriptor GetOutputHostTensorDescriptor(const std::vector<std::size_t
    switch(num_dim_spatial)
    {
+    case 3: {
+        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWK{});
+    }
    case 2: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWK{});
    }
@@ -192,6 +198,9 @@ HostTensorDescriptor GetFiltersHostTensorDescriptor(const std::vector<std::size_
    switch(num_dim_spatial)
    {
+    case 3: {
+        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KZYXC{});
+    }
    case 2: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KYXC{});
    }
@@ -211,6 +220,9 @@ HostTensorDescriptor GetInputHostTensorDescriptor(const std::vector<std::size_t>
    switch(num_dim_spatial)
    {
+    case 3: {
+        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{});
+    }
    case 2: {
        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
    }
@@ -360,6 +372,11 @@ int main(int argc, char* argv[])
        switch(num_dim_spatial)
        {
+        case 3: {
+            auto ref_conv = ReferenceConvNDFwdInstance<3>();
+            verify_f(ref_conv);
+            break;
+        }
        case 2: {
            auto ref_conv = ReferenceConvNDFwdInstance<2>();
            verify_f(ref_conv);

--- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
+++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
@@ -180,6 +180,10 @@ int main(int argc, char* argv[])
    out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    // reset input to zero
+    in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{0});
+    in_device_buf.ToDevice(in_n_c_hi_wi_device_result.mData.data());
    // do GEMM
    auto conv     = DeviceConvBwdDataInstance{};
    auto invoker  = conv.MakeInvoker();

--- a/example/12_reduce/README.md
+++ b/example/12_reduce/README.md
@@ -37,7 +37,7 @@ cmake                                                                  \
 ```bash
 # -D <xxx> : input 4-d tensor lengths
 # -v <x> :   verification (0=no, 1=yes)
-#arg1: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
 #arg2: run kernel # of times (>1)
 ./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
 ```

--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -13,7 +13,7 @@
 #include "device_base.hpp"
 #include "device_reduce_blockwise.hpp"
 #include "host_reduce_util.hpp"
-#include "host_generic_reduction.hpp"
+#include "host_reduction.hpp"
 #include "reduction_enums.hpp"
 #include "reduction_operator_mapping.hpp"
@@ -21,13 +21,13 @@
 using namespace ck;
 using namespace ck::tensor_operation::device;
-using InDataType  = half_float::half;
+using InDataType  = ck::half_t;
-using OutDataType = half_float::half;
+using OutDataType = ck::half_t;
 using AccDataType = float;
-using kInDataType  = ck::half_t;
+using HostInDataType  = half_float::half;
-using kOutDataType = ck::half_t;
+using HostOutDataType = half_float::half;
-using kAccDataType = float;
+using HostAccDataType = float;
 constexpr int Rank         = 4;
 constexpr int NumReduceDim = 3;
@@ -43,9 +43,9 @@ using InElementwiseOperation =
 using AccElementwiseOperation =
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
-using DeviceReduceInstance = DeviceReduceBlockWise<kInDataType,
+using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
-                                                   kAccDataType,
+                                                   AccDataType,
-                                                   kOutDataType,
+                                                   OutDataType,
                                                   Rank,
                                                   NumReduceDim,
                                                   ReduceOperation,
@@ -135,6 +135,10 @@ class SimpleAppArgs
        std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
                     "comparing with the host-based reduction"
                  << std::endl;
+        std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
+                     "value, 3=decimal value)"
+                  << std::endl;
+        std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
    };
    int processArgs(int argc, char* argv[])
@@ -263,20 +267,21 @@ int main(int argc, char* argv[])
    {
        switch(args.init_method)
        {
-        case 0:
+        case 0: break;
-            in.GenerateTensorValue(GeneratorTensor_1<InDataType>{}, num_thread);
+        case 1:
+            in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
            if(beta != 0.0f)
-                out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{}, num_thread);
+                out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
            break;
-        case 1:
+        case 2:
            in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
            break;
        default:
-            in.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
+            in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
            if(beta != 0.0f)
-                out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
+                out_ref.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
        }
        if(beta != 0.0f)
@@ -293,17 +298,27 @@ int main(int argc, char* argv[])
    if(beta != 0.0f)
        out_dev.ToDevice(out.mData.data());
-    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
+    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
    DeviceMem out_indices_dev(indicesSizeInBytes);
    if(args.do_verification)
    {
-        ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, PropagateNan, NeedIndices>
+        ReductionHost<HostInDataType,
+                      HostAccDataType,
+                      HostOutDataType,
+                      ReduceOpId,
+                      Rank,
+                      NumReduceDim,
+                      PropagateNan,
+                      NeedIndices>
            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-        hostReduce.Run(
+        hostReduce.Run(alpha,
-            alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
+                       reinterpret_cast<const HostInDataType*>(in.mData.data()),
+                       beta,
+                       reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
+                       out_indices_ref.mData.data());
    };
    const auto i_inLengths  = to_int_vector(args.inLengths);
@@ -313,7 +328,7 @@ int main(int argc, char* argv[])
    auto reduce = DeviceReduceInstance{};
-    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths);
+    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
    DeviceMem ws_dev(wsSizeInBytes);

--- a/example/13_pool2d_fwd/README.md
+++ b/example/13_pool2d_fwd/README.md
@@ -36,7 +36,7 @@ cmake                                                                  \
 ## Run ```pool2d_fwd```
 ```bash
 #arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
 ./example/pool2d_fwd 1 1 10

--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd.cpp
@@ -236,8 +236,9 @@ int main(int argc, char* argv[])
    switch(init_method)
    {
    case 0: break;
-    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}); break;
-    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+    case 2: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
    }
    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());

--- a/example/15_grouped_gemm/CMakeLists.txt
+++ b/example/15_grouped_gemm/CMakeLists.txt
+add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
--- a/example/15_grouped_gemm/README.md
+++ b/example/15_grouped_gemm/README.md
+# Instructions for ```grouped_gemm_xdl``` Example
+## Docker script
+```bash
+docker run                                                                   \
+-it                                                                          \
+--rm                                                                         \
+--privileged                                                                 \
+--group-add sudo                                                             \
+-w /root/workspace                                                           \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
+rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
+/bin/bash
+```
+## Build ```grouped_gemm_xdl```
+```bash
+mkdir build && cd build
+```
+```bash
+# Need to specify target ID, example below is gfx908
+cmake                                                                  \
+-D BUILD_DEV=OFF                                                       \
+-D CMAKE_BUILD_TYPE=Release                                            \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
+..
+```
+```bash
+ make -j example_grouped_gemm_xdl_fp16
+```
+## Run ```grouped_gemm_xdl```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: run kernel # of times (>1)
+./bin/example_grouped_gemm_xdl_fp16 0 1 5
+```
+Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
+```
+gemm[0] a_m_k: dim 2, lengths {256, 64}, strides {64, 1} b_k_n: dim 2, lengths {64, 128}, strides {1, 64} c_m_n: dim 2, lengths {256, 128}, strides {128, 1}
+gemm[1] a_m_k: dim 2, lengths {512, 128}, strides {128, 1} b_k_n: dim 2, lengths {128, 256}, strides {1, 128} c_m_n: dim 2, lengths {512, 256}, strides {256, 1}
+gemm[2] a_m_k: dim 2, lengths {768, 192}, strides {192, 1} b_k_n: dim 2, lengths {192, 384}, strides {1, 192} c_m_n: dim 2, lengths {768, 384}, strides {384, 1}
+gemm[3] a_m_k: dim 2, lengths {1024, 256}, strides {256, 1} b_k_n: dim 2, lengths {256, 512}, strides {1, 256} c_m_n: dim 2, lengths {1024, 512}, strides {512, 1}
+group: 0 arg.a_grid_desc_k0_m_k1_{8, 256, 8}, arg.b_grid_desc_k0_n_k1_{8, 128, 8}, arg.c_grid_desc_m_n_{ 256, 128}
+group: 1 arg.a_grid_desc_k0_m_k1_{16, 512, 8}, arg.b_grid_desc_k0_n_k1_{16, 256, 8}, arg.c_grid_desc_m_n_{ 512, 256}
+group: 2 arg.a_grid_desc_k0_m_k1_{24, 768, 8}, arg.b_grid_desc_k0_n_k1_{24, 384, 8}, arg.c_grid_desc_m_n_{ 768, 384}
+group: 3 arg.a_grid_desc_k0_m_k1_{32, 1024, 8}, arg.b_grid_desc_k0_n_k1_{32, 512, 8}, arg.c_grid_desc_m_n_{ 1024, 512}
+launch_and_time_kernel: grid_dim {30, 1, 1}, block_dim {256, 1, 1} 
+Warm up
+Start running 5 times...
+Perf: 0.037887 ms, 11.0706 TFlops, 90.8132 GB/s, DeviceGroupedGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2>
+```
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_gemm.hpp"
+#include "device_tensor.hpp"
+#include "device_grouped_gemm_xdl.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ADataType   = ck::half_t;
+using BDataType   = ck::half_t;
+using CDataType   = ck::half_t;
+using AccDataType = float;
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
+// static constexpr auto GemmMNPadding =
+// ck::tensor_operation::device::GemmSpecialization_t::MNPadding;
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl
+//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|      Num|
+//######|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Spacialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|
+//######|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|         |
+//######|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |
+        <   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1>;
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
+int main(int argc, char* argv[])
+{
+    bool do_verification = 0;
+    int init_method      = 0;
+    int nrepeat          = 5;
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: run kernel # of times (>1)\n");
+        exit(0);
+    }
+    int group_count = 4;
+    // GEMM shape
+    std::vector<ck::tensor_operation::device::GemmShape> gemm_shapes;
+    std::vector<const void*> p_a, p_b;
+    std::vector<void*> p_c;
+    gemm_shapes.reserve(group_count);
+    for(int i = 0; i < group_count; i++)
+    {
+        int M = 256 + 256 * i;
+        int N = 128 + 128 * i;
+        int K = 64 + 64 * i;
+        gemm_shapes.push_back({M, N, K, K, K, N});
+    }
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({stride, 1}));
+            }
+            else
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({1, stride}));
+            }
+        };
+    std::vector<Tensor<ADataType>> a_tensors;
+    ;
+    std::vector<Tensor<BDataType>> b_tensors;
+    std::vector<Tensor<CDataType>> c_host_tensors;
+    std::vector<Tensor<CDataType>> c_device_tensors;
+    a_tensors.reserve(group_count);
+    b_tensors.reserve(group_count);
+    c_host_tensors.reserve(group_count);
+    c_device_tensors.reserve(group_count);
+    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
+    std::vector<DeviceMemPtr> a_tensors_device, b_tensors_device, c_tensors_device;
+    a_tensors_device.reserve(group_count);
+    b_tensors_device.reserve(group_count);
+    c_tensors_device.reserve(group_count);
+    std::size_t flop = 0, num_btype = 0;
+    for(int i = 0; i < gemm_shapes.size(); i++)
+    {
+        a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
+            gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
+        b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor(
+            gemm_shapes[i].K, gemm_shapes[i].N, gemm_shapes[i].StrideB, BLayout{})));
+        c_host_tensors.push_back(Tensor<CDataType>(f_host_tensor_descriptor(
+            gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));
+        c_device_tensors.push_back(Tensor<CDataType>(f_host_tensor_descriptor(
+            gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));
+        std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc
+                  << " b_k_n: " << b_tensors[i].mDesc << " c_m_n: " << c_device_tensors[i].mDesc
+                  << std::endl;
+        flop += std::size_t(2) * gemm_shapes[i].M * gemm_shapes[i].K * gemm_shapes[i].N;
+        num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() +
+                     sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() +
+                     sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSize();
+        switch(init_method)
+        {
+        case 0: break;
+        case 1:
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+            break;
+        case 2:
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+            break;
+        default:
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        }
+    }
+    for(int i = 0; i < gemm_shapes.size(); i++)
+    {
+        a_tensors_device.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpace()));
+        b_tensors_device.emplace_back(
+            std::make_unique<DeviceMem>(sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpace()));
+        c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
+            sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSpace()));
+        a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
+        b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
+        p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
+        p_b.push_back(b_tensors_device[i]->GetDeviceBuffer());
+        p_c.push_back(c_tensors_device[i]->GetDeviceBuffer());
+    }
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+    // do GEMM
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+    auto argument =
+        gemm.MakeArgument(p_a, p_b, p_c, gemm_shapes, a_element_op, b_element_op, c_element_op);
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    float ave_time = invoker.Run(argument, nrepeat);
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+    if(do_verification)
+    {
+        for(int i = 0; i < gemm_shapes.size(); i++)
+        {
+            c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
+            auto ref_gemm    = ReferenceGemmInstance{};
+            auto ref_invoker = ref_gemm.MakeInvoker();
+            auto ref_argument = ref_gemm.MakeArgument(a_tensors[i],
+                                                      b_tensors[i],
+                                                      c_host_tensors[i],
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      c_element_op);
+            ref_invoker.Run(ref_argument);
+            check_error(c_host_tensors[i], c_device_tensors[i]);
+        }
+    }
+    return 0;
+}
--- a/example/16_gemm_reduce/CMakeLists.txt
+++ b/example/16_gemm_reduce/CMakeLists.txt
+add_example_executable(example_gemm_reduce_xdl_fp16 gemm_reduce_xdl_fp16.cpp)
--- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_gemm.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_xdl.hpp"
+#include "device_gemm_reduce_xdl_cshuffle.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+#include "element_wise_reduce_operation.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using ADataType = F16;
+using BDataType = F16;
+using CDataType = F16;
+using DDataType = F32;
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
+using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;
+static constexpr auto GemmSpecialization =
+    ck::tensor_operation::device::GemmSpecialization_t::Default;
+// clang-format off
+using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
+//######| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc| DData|           A|           B|           C|         D0|         D1|               GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+//######|        |        |        | Type|  Type|  Type| DataType| DataType|  DataType|  Type| Elementwise| Elementwise| Elementwise|     Reduce|     Reduce|     Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
+//######|        |        |        |     |      |      |         |         |          |      |   Operation|   Operation|   Operation|  Operation|  Operation|                   |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
+//######|        |        |        |     |      |      |         |         |          |      |            |            |            |           |           |                   |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
+        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32,  AElementOp,  BElementOp,  CElementOp, D0ReduceOp, D1ReduceOp, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>;
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
+int main(int argc, char* argv[])
+{
+    bool do_verification = 1;
+    int init_method      = 1;
+    int nrepeat          = 5;
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+    if(argc == 1)
+    {
+        // do nothing
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+        exit(0);
+    }
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({stride, 1}));
+            }
+            else
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({1, stride}));
+            }
+        };
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<DDataType> d0_m_host_result(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<DDataType> d1_m_host_result(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<DDataType> d0_m_device_result(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<DDataType> d1_m_device_result(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+    std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
+    std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem d0_device_buf(sizeof(DDataType) * d0_m_device_result.mDesc.GetElementSpace());
+    DeviceMem d1_device_buf(sizeof(DDataType) * d1_m_device_result.mDesc.GetElementSpace());
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+    auto d0_reduce_op = D0ReduceOp{};
+    auto d1_reduce_op = D1ReduceOp{};
+    // do GEMM
+    auto gemm     = DeviceGemmReduceInstance{};
+    auto invoker  = gemm.MakeInvoker();
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                      static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
+                                      static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      StrideC,
+                                      a_element_op,
+                                      b_element_op,
+                                      c_element_op,
+                                      d0_reduce_op,
+                                      d1_reduce_op);
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    // warm up
+    invoker.Run(argument);
+    // timing
+    float total_time = 0;
+    for(int i = 0; i < nrepeat; ++i)
+    {
+        // init DO, D1 to 0
+        d0_device_buf.SetZero();
+        d1_device_buf.SetZero();
+        KernelTimer timer;
+        timer.Start();
+        invoker.Run(argument);
+        timer.End();
+        total_time += timer.GetElapsedTime();
+    }
+    float ave_time = total_time / nrepeat;
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+    if(do_verification)
+    {
+        c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+        d0_device_buf.FromDevice(d0_m_device_result.mData.data());
+        d1_device_buf.FromDevice(d1_m_device_result.mData.data());
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+        ref_invoker.Run(ref_argument);
+        for(int m = 0; m < M; ++m)
+        {
+            float d0_acc = d0_reduce_op.GetReduceZeroValue();
+            float d1_acc = d1_reduce_op.GetReduceZeroValue();
+            for(int n = 0; n < N; ++n)
+            {
+                d0_reduce_op.Reduce(d0_acc, c_m_n_host_result(m, n));
+                d1_reduce_op.Reduce(d1_acc, c_m_n_host_result(m, n));
+            }
+            d0_m_host_result(m) = d0_acc;
+            d1_m_host_result(m) = d1_acc;
+        }
+        check_error(c_m_n_host_result, c_m_n_device_result);
+        check_error(d0_m_host_result, d0_m_device_result);
+        check_error(d1_m_host_result, d1_m_device_result);
+    }
+    return 0;
+}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -39,3 +39,5 @@ add_subdirectory(11_conv2d_bwd_wgt)
 add_subdirectory(12_reduce)
 add_subdirectory(13_pool2d_fwd)
 add_subdirectory(14_gemm_xdl_requant_relu_requant)
+add_subdirectory(15_grouped_gemm)
+add_subdirectory(16_gemm_reduce)
--- a/include/ck/config.hpp
+++ b/include/ck/config.hpp
@@ -157,16 +157,22 @@
 #define CK_WORKAROUND_SWDEV_325164 1
 #endif
+// workaround for verification failure ConvNd forward
+// https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/135
+#ifndef CK_WORKAROUND_GITHUB_135
+#define CK_WORKAROUND_GITHUB_135 1
+#endif
 namespace ck {
-enum InMemoryDataOperationEnum_t
+enum struct InMemoryDataOperationEnum_t
 {
    Set,
    AtomicAdd,
    Add
 };
-enum ActivTypeEnum_t
+enum struct ActivTypeEnum_t
 {
    None,
    LeakyRelu,

--- a/include/ck/tensor_operation/gpu/device/conv_utils.hpp
+++ b/include/ck/tensor_operation/gpu/device/conv_utils.hpp
@@ -186,6 +186,28 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
        return HostTensorDescriptor(
            dims, std::vector<std::size_t>{C * dims[2] * dims[3], 1, dims[3] * C, C});
    }
+    // 3D
+    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCDHW>::value ||
+                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KCZYX>::value ||
+                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NKDHW>::value)
+    {
+        return HostTensorDescriptor(dims,
+                                    std::vector<std::size_t>{C * dims[2] * dims[3] * dims[4],
+                                                             dims[2] * dims[3] * dims[4],
+                                                             dims[3] * dims[4],
+                                                             dims[4],
+                                                             1});
+    }
+    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWC>::value ||
+                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KZYXC>::value ||
+                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWK>::value)
+    {
+        return HostTensorDescriptor(
+            dims,
+            std::vector<std::size_t>{
+                C * dims[2] * dims[3] * dims[4], 1, C * dims[3] * dims[4], C * dims[4], C});
+    }
    std::stringstream err_msg;
    err_msg << "Unsupported data layout provided: " << layout << "!";

--- a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
@@ -5,7 +5,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-enum ConvolutionBackwardDataSpecialization_t
+enum struct ConvolutionBackwardDataSpecialization_t
 {
    Default,
    Filter1x1Stride1Pad0,

--- a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
 #ifndef CONVOLUTION_FORWARD_SPECIALIZATION
 #define CONVOLUTION_FORWARD_SPECIALIZATION
+#include <string>
 namespace ck {
 namespace tensor_operation {
 namespace device {
-enum ConvolutionForwardSpecialization_t
+enum struct ConvolutionForwardSpecialization_t
 {
    Default,
    Filter1x1Pad0,
@@ -13,6 +15,18 @@ enum ConvolutionForwardSpecialization_t
    OddC,
 };
+inline std::string getConvFwdSpecializationStr(const ConvolutionForwardSpecialization_t& s)
+{
+    switch(s)
+    {
+    case ConvolutionForwardSpecialization_t::Default: return "Default";
+    case ConvolutionForwardSpecialization_t::Filter1x1Pad0: return "Filter1x1Pad0";
+    case ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
+    case ConvolutionForwardSpecialization_t::OddC: return "OddC";
+    default: return "Unrecognized specialization!";
+    }
+}
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck