Merge branch 'develop' into aosewski/wavelet_omniperf

dc0bae32 · Adam Osewski · 68474822 · ba40c2ce · dc0bae32 · dc0bae32
Commit dc0bae32 authored Feb 01, 2023 by Adam Osewski
20 changed files
--- a/example/44_conv2d_fwd_quant/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/44_conv2d_fwd_quant/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
@@ -11,6 +11,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
@@ -150,14 +151,14 @@ bool run_grouped_conv_fwd(bool do_verification,
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
                                      wei_device_buf.GetDeviceBuffer(),
-                                      std::array<const void*, 0>{},
+                                      {},
                                      out_device_buf.GetDeviceBuffer(),
                                      a_g_n_c_wis_lengths,
                                      a_g_n_c_wis_strides,
                                      b_g_k_c_xs_lengths,
                                      b_g_k_c_xs_strides,
-                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                      {},
+                                      {},
                                      e_g_n_k_wos_lengths,
                                      e_g_n_k_wos_strides,
                                      conv_filter_strides,
@@ -213,8 +214,8 @@ bool run_grouped_conv_fwd(bool do_verification,

        out_device_buf.FromDevice(out_device.mData.data());

-        pass &= ck::utils::check_err(
-            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+        pass &=
+            ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
    }

    return (pass ? 0 : 1);

--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
 add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
+add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
@@ -3,7 +3,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -19,7 +19,7 @@ using BDataType = F16;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
+    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
                                                        ck::Tuple<BDataType>,
                                                        PassThrough,
                                                        4,

--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
+#include <iostream>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+using F16 = ck::half_t;
+
+using ADataType = F16;
+using BDataType = F16;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceElementwisePermuteInstance =
+    ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
+                                                          ck::Tuple<BDataType>,
+                                                          PassThrough,
+                                                          3, // NumDim_M
+                                                          1, // NumDim_N
+                                                          8,
+                                                          8,
+                                                          ck::Sequence<8>,
+                                                          ck::Sequence<8>>;
+
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_nhwc,
+                        const HostTensorA& A_nchw,
+                        const std::vector<std::size_t>& shape_nchw,
+                        Functor functor)
+{
+    for(std::size_t n = 0; n < shape_nchw[0]; ++n)
+        for(std::size_t c = 0; c < shape_nchw[1]; ++c)
+            for(std::size_t h = 0; h < shape_nchw[2]; ++h)
+                for(std::size_t w = 0; w < shape_nchw[3]; ++w)
+                {
+                    auto a_val = A_nchw(n, c, h, w);
+                    functor(B_nhwc(n, h, w, c), a_val);
+                }
+}
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    const int N = 120;
+    const int C = 128;
+    const int H = 32;
+    const int W = 1024;
+
+    /**const int N = 120;
+    const int H = 32;
+    const int W = 64;
+
+    const int C = 128;**/
+
+    std::vector<std::size_t> nchw = {N, C, H, W};
+    std::vector<std::size_t> nhwc = {N, H, W, C};
+
+    Tensor<ADataType> a(nchw);
+    Tensor<BDataType> b(nhwc);
+
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a.mData.data());
+    // LogRangeAsType<float>(std::cout << "Tensor a  : ", a.mData, ",") << std::endl;
+
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+
+    std::array<ck::index_t, 4> ab_lengths{N, H, W, C};
+
+    std::array<ck::index_t, 4> a_strides = {C * H * W, W, 1, H * W};
+    std::array<ck::index_t, 4> b_strides = {H * W * C, W * C, C, 1};
+
+    auto broadcastPermute = DeviceElementwisePermuteInstance{};
+    auto argument         = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
+
+    if(!broadcastPermute.IsSupportedArgument(argument.get()))
+    {
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the device instance, exiting!");
+    };
+
+    std::cout << "A (nchw): " << a.mDesc << std::endl;
+    std::cout << "B (nhwc): " << b.mDesc << std::endl;
+
+    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
+    float ave_time =
+        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
+
+    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
+                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        b_device_buf.FromDevice(b.mData.data());
+        // LogRangeAsType<float>(std::cout << "Tensor b  : ", b.mData, ",") << std::endl;
+
+        Tensor<BDataType> host_b(nhwc);
+        host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
+            host_b, a, nchw, PassThrough{});
+
+        // LogRangeAsType<float>(std::cout << "Host b  : ", host_b.mData, ",") << std::endl;
+        pass &=
+            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
+    }
+
+    return pass ? 0 : 1;
+}
--- a/example/46_gemm_add_multiply/CMakeLists.txt
+++ b/example/46_gemm_add_multiply/CMakeLists.txt
+add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp)
+add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp)
--- a/example/46_gemm_add_multiply/README.md
+++ b/example/46_gemm_add_multiply/README.md
+# Instructions for ```example_gemm_add_multiply_dl_fp16```
+
+## Run ```example_gemm_add_multiply_dl_fp16```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
+./bin/example_gemm_add_multiply_dl_fp16 1 1 1
+```
+
+Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
+```
+a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
+b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
+d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
+d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
+e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
+arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
+arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
+arg.e_grid_desc_m_n_{ 3840, 4096}
+launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 3.99904 ms, 32.22 TFlops, 31.9913 GB/s, DeviceGemmMultipleD_Dl<256, 128, 128, 16, 2, 4, 4, 1>
+```
--- a/example/46_gemm_add_multiply/common.hpp
+++ b/example/46_gemm_add_multiply/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddMultiply = ck::tensor_operation::element_wise::AddMultiply;
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+using I8   = int8_t;
+using I32  = int32_t;
+
+struct ProblemSize final
+{
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA  = 4096;
+    ck::index_t StrideB  = 4096;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = 4096;
+    ck::index_t StrideE  = 4096;
+};
+
+struct ExecutionConfig final
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+};
+
+inline bool
+parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
+{
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 12)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+
+        problem_size.M = std::stoi(argv[4]);
+        problem_size.N = std::stoi(argv[5]);
+        problem_size.K = std::stoi(argv[6]);
+
+        problem_size.StrideA  = std::stoi(argv[7]);
+        problem_size.StrideB  = std::stoi(argv[8]);
+        problem_size.StrideD0 = std::stoi(argv[9]);
+        problem_size.StrideD1 = std::stoi(argv[10]);
+        problem_size.StrideE  = std::stoi(argv[11]);
+    }
+    else
+    {
+        std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
+                  << std::endl
+                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
+                  << "arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
+                     "StrideE"
+                  << std::endl;
+        return false;
+    }
+
+    return true;
+}
--- a/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
+++ b/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
+
+using ADataType   = F16;
+using BDataType   = F16;
+using AccDataType = F32;
+using D0DataType  = F16;
+using D1DataType  = F16;
+using DsDataType  = ck::Tuple<D0DataType, D1DataType>;
+using EDataType   = F16;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddMultiply;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::
+        //  ##################| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|     DsData|     EData|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        //  ##################|        |        |         |        |      Type|      Type|        Type|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        //  ##################|        |        |         |        |          |          |            |           |          |   Operation|   Operation|    Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        //  ##################|        |        |         |        |          |          |            |           |          |            |            |             |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        DeviceGemmMultipleD_Dl< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        AccDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+
+#include "run_gemm_add_multiply_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_multiply_example(argc, argv); }
--- a/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
+++ b/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
+
+using ADataType   = F16;
+using BDataType   = F16;
+using AccDataType = F32;
+using D0DataType  = F16;
+using D1DataType  = F16;
+using DsDataType  = ck::Tuple<D0DataType, D1DataType>;
+using EDataType   = F16;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddMultiply;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::
+        //##############################|      A|      B|       Ds|      E| AData| BData| AccData| CShuffle|     DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //##############################| Layout| Layout|   Layout| Layout|  Type|  Type|    Type| DataType|       Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //##############################|       |       |         |       |      |      |        |         |           |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##############################|       |       |         |       |      |      |        |         |           |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, DsLayout,    Row,   F16,   F16,     F32,      F16, DsDataType,   F16, PassThrough, PassThrough, CDEElementOp,    GemmDefault,        1,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               8>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        AccDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+
+#include "run_gemm_add_multiply_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_multiply_example(argc, argv); }
--- a/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
+++ b/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
+#pragma once
+
+bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-1, 1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               {StrideD0, StrideD1},
+                               StrideE,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+	    std::cout << "wrong! this device_op instance does not support this problem" << std::endl;
+	    return true;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+    std::size_t flop      = 2_uz * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                            sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
+                            sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << device_op.GetTypeString() << std::endl;
+
+    if(config.do_verification)
+    {
+        Tensor<AccDataType> c_m_n({M, N});
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+    }
+
+    return true;
+}
+
+bool run_gemm_add_multiply_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    return !parse_cmd_args(argc, argv, problem_size, config) ||
+           run_gemm_add_multiply(problem_size, config);
+}
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -30,7 +30,7 @@
 // check GPU target
 #ifdef __HIP_DEVICE_COMPILE__
 #if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
-      defined(__gfx90a__) || defined(__gfx1030__))
+      defined(__gfx90a__) || defined(__gfx1030__) || defined(__gfx1100__))
 #error Not supported target
 #endif
 #endif
@@ -43,6 +43,8 @@
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
 #elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
+#elif defined(__gfx1100__) // for GPU code
+#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000
 #endif

 // FMA instruction
@@ -67,6 +69,13 @@
 #define CK_USE_AMD_MFMA_BF16_1K_OP
 #endif

+// WMMA instruction
+#ifndef __HIP_DEVICE_COMPILE__ // for host code
+#define CK_USE_AMD_WMMA
+#elif defined(__gfx1100__) // for GPU code
+#define CK_USE_AMD_WMMA
+#endif
+
 // buffer load
 #define CK_USE_AMD_BUFFER_LOAD 1

@@ -166,6 +175,9 @@
 #define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 0
 #endif // __gfx908__

+// flag to enable (1) or disable (0) the debugging output in some kernels
+#define DEBUG_LOG 0
+
 namespace ck {

 enum struct InMemoryDataOperationEnum

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/warp/wmma_gemm.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+
+#define CK_MNK_LOOP
+
+namespace ck {
+
+template <index_t BlockSize,
+          typename FloatA,
+          typename FloatB,
+          typename FloatAcc,
+          typename AK0MK1BlockDesc,
+          typename BK0NK1BlockDesc,
+          index_t MPerWMMA,
+          index_t NPerWMMA,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+/* A: K0PerBlock x MPerBlock x K1
+ * B: K0PerBlock x NPerBlock x K1
+ * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs
+ * KPACK == WMMA_K = 16
+ */
+struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle
+{
+    static constexpr auto I0    = Number<0>{};
+    static constexpr auto I1    = Number<1>{};
+    static constexpr auto I2    = Number<2>{};
+    static constexpr auto I3    = Number<3>{};
+    static constexpr auto I4    = Number<4>{};
+    static constexpr auto WmmaK = Number<16>{};
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one.
+    static constexpr index_t WaveSize = 32;
+
+    static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1);
+    static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1);
+    static constexpr index_t KPerBlock =
+        BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2);
+
+    static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0);
+    static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0);
+    static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
+    static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);
+
+    static constexpr auto wmma_gemm =
+        WmmaGemm<FloatA, FloatB, FloatAcc, MPerWMMA, NPerWMMA, KPack>{};
+
+    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA);
+    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA);
+
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                              FloatAcc,
+                              MRepeat * NRepeat,
+                              wmma_gemm.GetRegSizePerWmma(),
+                              true>
+        c_thread_buf_;
+
+    __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; }
+
+    __device__ static auto GetWaveIdx()
+    {
+        const index_t thread_id = ThisThreadBlock::GetThreadId();
+
+        constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))),
+            make_tuple(Sequence<0, 1, 2>{}),
+            make_tuple(Sequence<0>{}));
+
+        return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
+    }
+
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex();
+        //  |KRepeat   |MRepeat|MWave      |MLane       |KPack
+        return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0);
+    }
+
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_n = wave_idx[I1];
+
+        const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex();
+        //  |KRepeat   |NRepeat|Nwave      |NLane       |KPack
+        return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0);
+    }
+
+    template <index_t m0, index_t n0>
+    __device__ static auto CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>)
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+        const auto waveId_n = wave_idx[I1];
+
+        const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk();
+
+        constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex(
+            make_tuple(m0, waveId_m, blk_idx[I0]))[I0];
+        const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex(
+            make_tuple(n0, waveId_n, blk_idx[I1]))[I0];
+
+        return make_tuple(c_thread_m, c_thread_n);
+    }
+
+    __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle()
+    {
+        static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
+                          BK0NK1BlockDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+
+        static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize,
+                      "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n");
+
+        static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 &&
+                          NPerBlock % (NPerWMMA * NRepeat) == 0,
+                      "wrong!");
+    }
+
+    // Thread level, register decriptor. Vector-write
+    __host__ __device__ static constexpr auto
+    GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
+            wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
+
+        constexpr auto MSubGroup          = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0];
+        constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1];
+        constexpr auto MAccVgprs          = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2];
+
+        return make_naive_tensor_descriptor_packed(
+            //        |MRepeat           |MWave |MSubGroup |NRepeat           |NWave
+            //        |NThreadPerSubGroup |MAccVgprs
+            make_tuple(Number<MRepeat>{},
+                       I1,
+                       MSubGroup,
+                       Number<NRepeat>{},
+                       I1,
+                       NThreadPerSubGroup,
+                       MAccVgprs));
+    }
+
+    // Provide dimension size
+    __host__ __device__ static constexpr auto
+    GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
+                                                           Number<MWaves>{},
+                                                           Number<MPerWMMA>{},
+                                                           Number<NRepeat>{},
+                                                           Number<NWaves>{},
+                                                           Number<NPerWMMA>{}));
+
+        return wmma_gemm
+            .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+                c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma);
+    }
+
+    __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1()
+    {
+        return transform_tensor_descriptor(
+            AK0MK1BlockDesc{},
+            make_tuple(make_pass_through_transform(Number<A_K0>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<MRepeat>{}, Number<MWaves>{}, Number<MPerWMMA>{})),
+                       make_pass_through_transform(Number<A_K1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}));
+    }
+
+    __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1()
+    {
+        return transform_tensor_descriptor(
+            BK0NK1BlockDesc{},
+            make_tuple(make_pass_through_transform(Number<B_K0>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<NRepeat>{}, Number<NWaves>{}, Number<NPerWMMA>{})),
+                       make_pass_through_transform(Number<B_K1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}));
+    }
+
+    // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma
+    static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1();
+    static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1();
+
+    template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
+    __device__ void Run(const ABlockBuffer& a_block_buf,
+                        const BBlockBuffer& b_block_buf,
+                        CThreadBuffer& c_thread_buf) const
+    {
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                // read A
+                a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                                   make_tuple(Number<k * WmmaK / A_K1>{}, m0, I0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(I0, m0, I0, I0, I0),
+                                   a_thread_buf);
+
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    // read B
+                    b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                                       make_tuple(Number<k * WmmaK / B_K1>{}, n0, I0, I0, I0),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(I0, n0, I0, I0, I0),
+                                       b_thread_buf);
+                    vector_type<FloatA, WmmaK> a_thread_vec;
+                    vector_type<FloatB, WmmaK> b_thread_vec;
+
+                    static_for<0, WmmaK, 1>{}([&](auto i) {
+                        a_thread_vec.template AsType<FloatA>()(i) =
+                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                make_tuple(i / A_K1, m0, 0, 0, i % A_K1))>{}];
+                        b_thread_vec.template AsType<FloatB>()(i) =
+                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                make_tuple(i / B_K1, n0, 0, 0, i % B_K1))>{}];
+                    });
+
+                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                    constexpr index_t c_offset =
+                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                    wmma_gemm.template Run(
+                        a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                        b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                });
+            });
+        });
+    }
+
+    protected:
+    // A[K0, M0, M1, M2, K1]
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<WmmaK / A_K1>{}, Number<MRepeat>{}, I1, I1, Number<A_K1>{}));
+
+    // B[K0, N0, N1, N2, K1]
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<WmmaK / B_K1>{}, Number<NRepeat>{}, I1, I1, Number<B_K1>{}));
+
+    // C[M, N, NumRegWMMA]
+    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, wmma_gemm.GetRegSizePerWmma()));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         FloatA,
+                                                         decltype(a_block_desc_k0_m0_m1_m2_k1),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<WmmaK / A_K1, 1, 1, 1, A_K1>,
+                                                         Sequence<0, 1, 2, 3, 4>,
+                                                         4,
+                                                         A_K1,
+                                                         A_K1>;
+
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         FloatB,
+                                                         decltype(b_block_desc_k0_n0_n1_n2_k1),
+                                                         decltype(b_thread_desc_),
+                                                         Sequence<WmmaK / B_K1, 1, 1, 1, B_K1>,
+                                                         Sequence<0, 1, 2, 3, 4>,
+                                                         4,
+                                                         B_K1,
+                                                         B_K1>;
+
+    AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
+    BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
+};
+
+// block wise level pipe designed for inline asm
+template <index_t BlockSize,
+          typename FloatA,
+          typename FloatB,
+          typename FloatAcc,
+          typename AK0MK1BlockDesc,
+          typename BK0NK1BlockDesc,
+          index_t MPerWMMA,
+          index_t NPerWMMA,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+/* A: K0PerBlock x MPerBlock x K1
+ * B: K0PerBlock x NPerBlock x K1
+ * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs
+ * KPACK == WMMA_K = 16
+ */
+struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO
+{
+    static constexpr auto I0    = Number<0>{};
+    static constexpr auto I1    = Number<1>{};
+    static constexpr auto I2    = Number<2>{};
+    static constexpr auto I3    = Number<3>{};
+    static constexpr auto I4    = Number<4>{};
+    static constexpr auto WmmaK = Number<16>{};
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one.
+    static constexpr index_t WaveSize = 32;
+
+    static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1);
+    static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1);
+    static constexpr index_t KPerBlock =
+        BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2);
+
+    static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0);
+    static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0);
+    static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
+    static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);
+
+    static constexpr auto wmma_gemm =
+        WmmaGemm<FloatA, FloatB, FloatAcc, MPerWMMA, NPerWMMA, KPack>{};
+
+    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA);
+    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA);
+
+    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                              FloatAcc,
+                              MRepeat * NRepeat,
+                              wmma_gemm.GetRegSizePerWmma(),
+                              true>
+        c_thread_buf_;
+
+    __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; }
+
+    __device__ static auto GetWaveIdx()
+    {
+        const index_t thread_id = ThisThreadBlock::GetThreadId();
+
+        constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))),
+            make_tuple(Sequence<0, 1, 2>{}),
+            make_tuple(Sequence<0>{}));
+
+        return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
+    }
+
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex();
+        //  |KRepeat   |MRepeat|MWave      |MLane       |KPack
+        return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0);
+    }
+
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_n = wave_idx[I1];
+
+        const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex();
+        //  |KRepeat   |NRepeat|Nwave      |NLane       |KPack
+        return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0);
+    }
+
+    template <index_t m0, index_t n0>
+    __device__ static auto CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>)
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+        const auto waveId_n = wave_idx[I1];
+
+        const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk();
+
+        constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1, 2>{}));
+
+        const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex(
+            make_tuple(m0, waveId_m, blk_idx[I0]))[I0];
+        const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex(
+            make_tuple(n0, waveId_n, blk_idx[I1]))[I0];
+
+        return make_tuple(c_thread_m, c_thread_n);
+    }
+
+    __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO()
+    {
+        static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
+                          BK0NK1BlockDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+
+        static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize,
+                      "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n");
+
+        static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 &&
+                          NPerBlock % (NPerWMMA * NRepeat) == 0,
+                      "wrong!");
+    }
+    // Thread level, register decriptor. Vector-write
+    __host__ __device__ static constexpr auto
+    GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
+            wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
+
+        constexpr auto MSubGroup          = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0];
+        constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1];
+        constexpr auto MAccVgprs          = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2];
+
+        return make_naive_tensor_descriptor_packed(
+            //        |MRepeat           |MWave |MSubGroup |NRepeat           |NWave
+            //        |NThreadPerSubGroup |MAccVgprs
+            make_tuple(Number<MRepeat>{},
+                       I1,
+                       MSubGroup,
+                       Number<NRepeat>{},
+                       I1,
+                       NThreadPerSubGroup,
+                       MAccVgprs));
+    }
+
+    template <typename CGridDesc_M_N>
+    __host__ __device__ static constexpr auto
+    MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+        const CGridDesc_M_N& c_grid_desc_m_n)
+    {
+        const auto M = c_grid_desc_m_n.GetLength(I0);
+        const auto N = c_grid_desc_m_n.GetLength(I1);
+
+        const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma =
+            transform_tensor_descriptor(
+                c_grid_desc_m_n,
+                make_tuple(
+                    make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)),
+                    make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{}));
+
+        return wmma_gemm
+            .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+                c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma);
+    }
+
+    // Provide dimension size
+    __host__ __device__ static constexpr auto
+    GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
+    {
+        constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma =
+            make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
+                                                           Number<MWaves>{},
+                                                           Number<MPerWMMA>{},
+                                                           Number<NRepeat>{},
+                                                           Number<NWaves>{},
+                                                           Number<NPerWMMA>{}));
+
+        return wmma_gemm
+            .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
+                c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma);
+    }
+
+    __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1()
+    {
+        return transform_tensor_descriptor(
+            AK0MK1BlockDesc{},
+            make_tuple(make_pass_through_transform(Number<A_K0>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<MRepeat>{}, Number<MWaves>{}, Number<MPerWMMA>{})),
+                       make_pass_through_transform(Number<A_K1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}));
+    }
+
+    __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1()
+    {
+        return transform_tensor_descriptor(
+            BK0NK1BlockDesc{},
+            make_tuple(make_pass_through_transform(Number<B_K0>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<NRepeat>{}, Number<NWaves>{}, Number<NPerWMMA>{})),
+                       make_pass_through_transform(Number<B_K1>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}));
+    }
+
+    // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma
+    static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1();
+    static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1();
+
+    template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
+    __device__ void Run(const ABlockBuffer& a_block_buf,
+                        const BBlockBuffer& b_block_buf,
+                        CThreadBuffer& c_thread_buf) const
+    {
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        constexpr auto RepeatDiff = MRepeat - NRepeat;
+        // Read all Mrepeat, Nrepeat
+        static_for<0, NRepeat, 1>{}([&](auto iN) {
+            b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                               make_tuple(I0, Number<iN>{}, I0, I0, I0),
+                               b_block_buf,
+                               b_thread_desc_,
+                               make_tuple(I0, Number<iN>{}, I0, I0, I0),
+                               b_thread_buf);
+        });
+
+        static_for<0, MRepeat, 1>{}([&](auto iM) {
+            a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                               make_tuple(I0, Number<iM>{}, I0, I0, I0),
+                               a_block_buf,
+                               a_thread_desc_,
+                               make_tuple(I0, Number<iM>{}, I0, I0, I0),
+                               a_thread_buf);
+        });
+
+        // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat
+        static_for<0, RepeatDiff, 1>{}([&](auto iCut) {
+            static_for<0, NRepeat, 1>{}([&](auto iN) {
+                vector_type<FloatA, WmmaK> a_thread_vec;
+                vector_type<FloatB, WmmaK> b_thread_vec;
+
+                static_for<0, WmmaK, 1>{}([&](auto iK) {
+                    a_thread_vec.template AsType<FloatA>()(iK) =
+                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                            make_tuple(iK / A_K1, iCut, 0, 0, iK % A_K1))>{}];
+                    b_thread_vec.template AsType<FloatB>()(iK) =
+                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                            make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}];
+                });
+                using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                constexpr index_t c_offset =
+                    c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0));
+                // s_nop();
+                wmma_gemm.template Run(
+                    a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                    b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                // s_nop();
+            });
+            if constexpr(KPerBlock > WmmaK)
+            {
+                // Read Consumed Next inner loop A
+                a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                                   make_tuple(Number<WmmaK / A_K1>{}, Number<iCut>{}, I0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(I0, Number<iCut>{}, I0, I0, I0),
+                                   a_thread_buf);
+            }
+        });
+
+        static_for<WmmaK, KPerBlock, WmmaK>{}([&](auto iWmmaK) {
+            // Stage 2: Run FIFO fashion loopover in Square
+            static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop) {
+                // Row Repeatation
+                static_for<WmmaInnerloop, NRepeat, 1>{}([&](auto iN) {
+                    vector_type<FloatA, WmmaK> a_thread_vec;
+                    vector_type<FloatB, WmmaK> b_thread_vec;
+
+                    static_for<0, WmmaK, 1>{}([&](auto iK) {
+                        a_thread_vec.template AsType<FloatA>()(iK) =
+                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
+                                iK / A_K1, WmmaInnerloop + RepeatDiff, 0, 0, iK % A_K1))>{}];
+                        b_thread_vec.template AsType<FloatB>()(iK) =
+                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}];
+                    });
+                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                    constexpr index_t c_offset = c_thread_desc_.CalculateOffset(
+                        make_tuple(WmmaInnerloop + RepeatDiff, iN, 0));
+                    // s_nop();
+                    wmma_gemm.template Run(
+                        a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                        b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    // s_nop();
+                });
+
+                // Read Consumed Next inner loop A
+                a_thread_copy_.Run(
+                    a_block_desc_k0_m0_m1_m2_k1,
+                    make_tuple(
+                        Number<iWmmaK / A_K1>{}, Number<WmmaInnerloop + RepeatDiff>{}, I0, I0, I0),
+                    a_block_buf,
+                    a_thread_desc_,
+                    make_tuple(I0, Number<WmmaInnerloop + RepeatDiff>{}, I0, I0, I0),
+                    a_thread_buf);
+
+                // Col Repeatation
+                static_for<WmmaInnerloop + 1 + RepeatDiff, MRepeat, 1>{}([&](auto iM) {
+                    vector_type<FloatA, WmmaK> a_thread_vec;
+                    vector_type<FloatB, WmmaK> b_thread_vec;
+
+                    static_for<0, WmmaK, 1>{}([&](auto iK) {
+                        a_thread_vec.template AsType<FloatA>()(iK) =
+                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                make_tuple(iK / A_K1, iM, 0, 0, iK % A_K1))>{}];
+                        b_thread_vec.template AsType<FloatB>()(iK) =
+                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                make_tuple(iK / B_K1, WmmaInnerloop, 0, 0, iK % B_K1))>{}];
+                    });
+                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                    constexpr index_t c_offset =
+                        c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0));
+                    // s_nop();
+                    wmma_gemm.template Run(
+                        a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                        b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    // s_nop();
+                });
+                // Read Consumed Next inner loop B
+                b_thread_copy_.Run(
+                    b_block_desc_k0_n0_n1_n2_k1,
+                    make_tuple(Number<iWmmaK / B_K1>{}, Number<WmmaInnerloop>{}, I0, I0, I0),
+                    b_block_buf,
+                    b_thread_desc_,
+                    make_tuple(I0, Number<WmmaInnerloop>{}, I0, I0, I0),
+                    b_thread_buf);
+            });
+
+            // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat
+            static_for<0, RepeatDiff, 1>{}([&](auto iCut) {
+                static_for<0, NRepeat, 1>{}([&](auto iN) {
+                    vector_type<FloatA, WmmaK> a_thread_vec;
+                    vector_type<FloatB, WmmaK> b_thread_vec;
+
+                    static_for<0, WmmaK, 1>{}([&](auto iK) {
+                        a_thread_vec.template AsType<FloatA>()(iK) =
+                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                make_tuple(iK / A_K1, iCut, 0, 0, iK % A_K1))>{}];
+                        b_thread_vec.template AsType<FloatB>()(iK) =
+                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}];
+                    });
+                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                    constexpr index_t c_offset =
+                        c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0));
+                    // s_nop();
+                    wmma_gemm.template Run(
+                        a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                        b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    // s_nop();
+                });
+                if constexpr(KPerBlock > WmmaK)
+                {
+                    a_thread_copy_.Run(
+                        a_block_desc_k0_m0_m1_m2_k1,
+                        make_tuple(Number<(iWmmaK + WmmaK) / A_K1>{}, Number<iCut>{}, I0, I0, I0),
+                        a_block_buf,
+                        a_thread_desc_,
+                        make_tuple(I0, Number<iCut>{}, I0, I0, I0),
+                        a_thread_buf);
+                }
+            });
+        });
+
+        // Stage 2: Run FIFO fashion loopover in Square
+        static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop) {
+            // Row Repeatation
+            static_for<WmmaInnerloop, NRepeat, 1>{}([&](auto iN) {
+                vector_type<FloatA, WmmaK> a_thread_vec;
+                vector_type<FloatB, WmmaK> b_thread_vec;
+
+                static_for<0, WmmaK, 1>{}([&](auto iK) {
+                    a_thread_vec.template AsType<FloatA>()(iK) =
+                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                            make_tuple(iK / A_K1, WmmaInnerloop + RepeatDiff, 0, 0, iK % A_K1))>{}];
+                    b_thread_vec.template AsType<FloatB>()(iK) =
+                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                            make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}];
+                });
+                using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                constexpr index_t c_offset =
+                    c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop + RepeatDiff, iN, 0));
+                // s_nop();
+                wmma_gemm.template Run(
+                    a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                    b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                // s_nop();
+            });
+
+            // Col Repeatation
+            static_for<WmmaInnerloop + 1 + RepeatDiff, MRepeat, 1>{}([&](auto iM) {
+                vector_type<FloatA, WmmaK> a_thread_vec;
+                vector_type<FloatB, WmmaK> b_thread_vec;
+
+                static_for<0, WmmaK, 1>{}([&](auto iK) {
+                    a_thread_vec.template AsType<FloatA>()(iK) =
+                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                            make_tuple(iK / A_K1, iM, 0, 0, iK % A_K1))>{}];
+                    b_thread_vec.template AsType<FloatB>()(iK) =
+                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                            make_tuple(iK / B_K1, WmmaInnerloop, 0, 0, iK % B_K1))>{}];
+                });
+                using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                constexpr index_t c_offset =
+                    c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0));
+                // s_nop();
+                wmma_gemm.template Run(
+                    a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                    b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                // s_nop();
+            });
+        });
+    }
+
+    protected:
+    // A[M0, M1, M2, K0 = WmmaK]
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<WmmaK / A_K1>{}, Number<MRepeat>{}, I1, I1, Number<A_K1>{}));
+
+    // B[N0, N1, N2, K0 = WmmaK]
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<WmmaK / B_K1>{}, Number<NRepeat>{}, I1, I1, Number<B_K1>{}));
+
+    // C[M, N, NumRegWMMA]
+    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, wmma_gemm.GetRegSizePerWmma()));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         FloatA,
+                                                         decltype(a_block_desc_k0_m0_m1_m2_k1),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<WmmaK / A_K1, 1, 1, 1, A_K1>,
+                                                         Sequence<0, 1, 2, 3, 4>,
+                                                         4,
+                                                         A_K1,
+                                                         A_K1>;
+
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         FloatB,
+                                                         decltype(b_block_desc_k0_n0_n1_n2_k1),
+                                                         decltype(b_thread_desc_),
+                                                         Sequence<WmmaK / B_K1, 1, 1, 1, B_K1>,
+                                                         Sequence<0, 1, 2, 3, 4>,
+                                                         4,
+                                                         B_K1,
+                                                         B_K1>;
+
+    AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
+    BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -3,8 +3,8 @@

 #pragma once

-#include <cmath>
 #include <string>
+#include <sstream>

 #include "ck/stream_config.hpp"

@@ -46,6 +46,17 @@ struct BaseOperator
    virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
    virtual std::string GetTypeString() const { return ""; }

+    virtual std::string GetTypeIdName() const { return typeid(*this).name(); }
+
+    virtual std::string GetTypeIdHashCode() const
+    {
+        std::ostringstream oss;
+
+        oss << std::hex << typeid(*this).hash_code();
+
+        return oss.str();
+    };
+
    virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; }

    virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const

--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType,
+          typename DyElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+struct DeviceBatchNormBwd : public BaseOperator
+{
+    static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim;
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const std::array<index_t, Rank> xyLengths,
+                        const std::array<index_t, Rank> xStrides,
+                        const std::array<index_t, Rank> dyStrides,
+                        const std::array<index_t, Rank> dxStrides,
+                        const std::array<int, NumBatchNormReduceDim> reduceDims,
+                        const std::array<ck::index_t, NumInvariantDim> bnScaleBiasMeanVarLengths,
+                        const std::array<ck::index_t, NumInvariantDim> bnScaleStrides,
+                        const std::array<ck::index_t, NumInvariantDim> bnDscaleDbiasStrides,
+                        const std::array<ck::index_t, NumInvariantDim> bnMeanVarStrides,
+                        const void* p_x,
+                        const void* p_dy,
+                        const void* p_scale,
+                        const void* p_savedMean,
+                        const void* p_savedInvVar,
+                        double epsilon,
+                        const DyElementwiseOp dy_elementwise_op,
+                        void* p_dx,
+                        void* p_dscale,
+                        void* p_dbias) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType,
+          typename DyElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+using DeviceBatchNormBwdPtr = std::unique_ptr<DeviceBatchNormBwd<XDataType,
+                                                                 DxDataType,
+                                                                 DyDataType,
+                                                                 AccDataType,
+                                                                 ScaleDataType,
+                                                                 DscaleDbiasDataType,
+                                                                 MeanVarDataType,
+                                                                 DyElementwiseOp,
+                                                                 Rank,
+                                                                 NumBatchNormReduceDim>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
@@ -13,7 +13,15 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

-template <index_t Rank, index_t NumBatchNormReduceDim, typename YElementwiseOp>
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename YElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
 struct DeviceBatchNormFwd : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
@@ -40,9 +48,24 @@ struct DeviceBatchNormFwd : public BaseOperator
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

-template <index_t Rank, index_t NumBatchNormReduceDim, typename YElementwiseOp>
-using DeviceBatchNormFwdPtr =
-    std::unique_ptr<DeviceBatchNormFwd<Rank, NumBatchNormReduceDim, YElementwiseOp>>;
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename YElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+using DeviceBatchNormFwdPtr = std::unique_ptr<DeviceBatchNormFwd<XDataType,
+                                                                 YDataType,
+                                                                 AccDataType,
+                                                                 ScaleDataType,
+                                                                 BiasDataType,
+                                                                 MeanVarDataType,
+                                                                 YElementwiseOp,
+                                                                 Rank,
+                                                                 NumBatchNormReduceDim>>;

 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
@@ -13,13 +13,22 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

-template <index_t Rank, index_t NumBatchNormReduceDim>
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename YElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
 struct DeviceBatchNormInfer : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
        const std::array<index_t, Rank> xyLengths,
        const std::array<index_t, Rank> xStrides,
        const std::array<index_t, Rank> yStrides,
+        const std::array<int, NumBatchNormReduceDim> reduceDims,
        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
        const std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
@@ -28,6 +37,7 @@ struct DeviceBatchNormInfer : public BaseOperator
        const void* bnScale,
        const void* bnBias,
        double epsilon,
+        const YElementwiseOp y_elementwise_op,
        const void* estimatedMean,
        const void* estimatedInvVariance,
        void* p_y) = 0;
@@ -35,8 +45,24 @@ struct DeviceBatchNormInfer : public BaseOperator
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

-template <index_t Rank, index_t NumBatchNormReduceDim>
-using DeviceBatchNormInferPtr = std::unique_ptr<DeviceBatchNormInfer<Rank, NumBatchNormReduceDim>>;
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename YElementwiseOp,
+          index_t Rank,
+          index_t NumBatchNormReduceDim>
+using DeviceBatchNormInferPtr = std::unique_ptr<DeviceBatchNormInfer<XDataType,
+                                                                     YDataType,
+                                                                     AccDataType,
+                                                                     ScaleDataType,
+                                                                     BiasDataType,
+                                                                     MeanVarDataType,
+                                                                     YElementwiseOp,
+                                                                     Rank,
+                                                                     NumBatchNormReduceDim>>;

 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp
@@ -17,7 +17,7 @@ template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          index_t NumDim>
-struct DeviceElementwiseBase : public BaseOperator
+struct DeviceElementwise : public BaseOperator
 {
    static constexpr int NumInput  = InDataTypeTuple::Size();
    static constexpr int NumOutput = OutDataTypeTuple::Size();
@@ -37,8 +37,8 @@ template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          index_t NumDim>
-using DeviceElementwiseBasePtr = std::unique_ptr<
-    DeviceElementwiseBase<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>;
+using DeviceElementwisePtr = std::unique_ptr<
+    DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>;

 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
@@ -32,7 +32,7 @@ struct DeviceElementwiseNormalization : public BaseOperator
                        const std::vector<index_t> betaStrides,
                        const std::vector<index_t> yStrides,
                        const std::vector<index_t> reduceDims,
-                        AccDataType epsilon,
+                        double epsilon,
                        const std::array<const void*, NumInput> in_dev_buffers,
                        const void* p_gamma,
                        const void* p_beta,

--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// GEMM:
+//   input : A[M, K]
+//   input : B[N, K]
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   output : H[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+//   H = layernorm(E)
+// Assume:
+//   D0, D1, ... and E have the same layout
+//   Calculate mean & variance along N dimension in layernorm(E)
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename HLayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename HDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          typename HElementwiseOperation>
+struct DeviceGemmMultipleDLayernorm : public BaseOperator
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        std::array<const void*, NumDTensor> p_ds,
+                        const void* p_gamma,
+                        const void* p_beta,
+                        void* p_h,
+                        index_t MRaw,
+                        index_t NRaw,
+                        index_t KRaw,
+                        index_t StrideA,
+                        index_t StrideB,
+                        std::array<index_t, NumDTensor> StrideDs,
+                        index_t StrideH,
+                        double epsilon,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op,
+                        HElementwiseOperation h_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+}; // namespace device
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/io.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+namespace {
+
+template <index_t NumDTensor>
+struct ComputePtrOffsetOfStridedBatch
+{
+    ComputePtrOffsetOfStridedBatch() = default;
+
+    ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
+                                   index_t BatchStrideB,
+                                   Array<ck::index_t, NumDTensor> BatchStrideDs,
+                                   index_t BatchStrideE)
+        : BatchStrideA_(BatchStrideA),
+          BatchStrideB_(BatchStrideB),
+          BatchStrideDs_(BatchStrideDs),
+          BatchStrideE_(BatchStrideE)
+    {
+    }
+
+    __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
+    {
+        return g_idx * static_cast<long_index_t>(BatchStrideA_);
+    }
+
+    __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
+    {
+        return g_idx * static_cast<long_index_t>(BatchStrideB_);
+    }
+
+    __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
+    {
+        Array<long_index_t, NumDTensor> ds_offset;
+        static_for<0, NumDTensor, 1>{}(
+            [&](auto i) { ds_offset(i) = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]); });
+        return ds_offset;
+    }
+
+    __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
+    {
+        return g_idx * static_cast<long_index_t>(BatchStrideE_);
+    }
+
+    index_t BatchStrideA_;
+    index_t BatchStrideB_;
+    Array<ck::index_t, NumDTensor> BatchStrideDs_;
+    index_t BatchStrideE_;
+};
+
+/*
+ * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
+ *
+ * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix
+ * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly
+ * strided batched, but we can easily extend to other layouts. The returned offset can be either \p
+ * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB
+ * limitations.
+ *
+ * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and
+ * returns the 2D index of the tile that it computes. \see
+ * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
+ *
+ * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
+ * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
+ * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
+ * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link
+ * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of
+ * pointer offset into \p ComputePtrOffsetOfStridedBatch.
+ *
+ * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes.
+ * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to
+ * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion).
+ *
+ */
+template <typename GridwiseGemm,
+          typename ABDataType,
+          typename DsPointer,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          typename AGridDesc_K0_M0_M1_K1,
+          typename BGridDesc_K0_N0_N1_K1,
+          typename DsGridDesc_M0_M10_M11_N0_N10_N11,
+          typename CGridDesc_M0_M10_M11_N0_N10_N11,
+          typename Block2CTileMap,
+          typename ComputePtrOffsetOfBatch,
+          bool HasMainKBlockLoop,
+          bool HasDoubleTailKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_grouped_conv_fwd_dl_multiple_d(
+            const ABDataType* __restrict__ p_a_grid,
+            const ABDataType* __restrict__ p_b_grid,
+            DsPointer p_ds_grid,
+            EDataType* __restrict__ p_e_grid,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CDEElementwiseOperation cde_element_op,
+            const index_t batch_count,
+            const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1,
+            const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1,
+            const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11,
+            const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
+            const Block2CTileMap block_2_ctile_map,
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__))
+    // offset base pointer for each work-group
+    const index_t num_blocks_per_batch =
+        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+
+    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+
+    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
+
+    constexpr index_t shared_block_size =
+        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
+
+    __shared__ ABDataType p_shared[shared_block_size];
+
+    DsPointer p_ds_grid_grp;
+
+    static constexpr index_t NumDTensor = DsGridDesc_M0_M10_M11_N0_N10_N11::Size();
+
+    static_for<0, NumDTensor, 1>{}(
+        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
+
+    GridwiseGemm::Run(p_a_grid + a_batch_offset,
+                      p_b_grid + b_batch_offset,
+                      p_ds_grid_grp,
+                      p_e_grid + c_batch_offset,
+                      p_shared,
+                      a_element_op,
+                      b_element_op,
+                      cde_element_op,
+                      a_grid_desc_k0_m0_m1_k1,
+                      b_grid_desc_k0_n0_n1_k1,
+                      ds_grid_desc_m0_m10_m11_n0_n10_n11,
+                      e_grid_desc_m0_m10_m11_n0_n10_n11,
+                      block_2_ctile_map,
+                      integral_constant<bool, HasMainKBlockLoop>{},
+                      integral_constant<bool, HasDoubleTailKBlockLoop>{});
+#else
+    ignore = p_a_grid;
+    ignore = p_b_grid;
+    ignore = p_ds_grid;
+    ignore = p_e_grid;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = cde_element_op;
+    ignore = batch_count;
+    ignore = a_grid_desc_k0_m0_m1_k1;
+    ignore = b_grid_desc_k0_n0_n1_k1;
+    ignore = ds_grid_desc_m0_m10_m11_n0_n10_n11;
+    ignore = e_grid_desc_m0_m10_m11_n0_n10_n11;
+    ignore = compute_ptr_offset_of_batch;
+    ignore = block_2_ctile_map;
+
+    compute_ptr_offset_of_batch.GetAPtrOffset(0);
+    compute_ptr_offset_of_batch.GetBPtrOffset(0);
+    compute_ptr_offset_of_batch.GetEPtrOffset(0);
+#endif
+}
+} // namespace
+
+//
+// @brief      Device Convolution operation.
+//
+// Supports:
+//  @li         Forward convolution with up to 3 spatial dimentions
+//  @li         Input tensor in GNWC data format
+//  @li         Weight tensor in GKXC data format
+//  @li         Output tensor in GNWK data format
+//
+// 1D:
+// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
+// 2D:
+// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
+// 3D:
+// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
+//
+template <index_t NDimSpatial,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AccDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          ConvolutionForwardSpecialization ConvForwardSpecialization,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t K0PerBlock,
+          index_t K1,
+          index_t M1PerThread,
+          index_t N1PerThread,
+          index_t KPerThread,
+          typename M1N1ThreadClusterM1Xs,
+          typename M1N1ThreadClusterN1Xs,
+          typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
+          typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
+          typename ABlockTransferSrcVectorTensorContiguousDimOrder,
+          typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
+          typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
+          typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
+          typename BBlockTransferSrcVectorTensorContiguousDimOrder,
+          typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector>
+struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
+    : public DeviceGroupedConvFwdMultipleD<NDimSpatial,
+                                           ALayout,
+                                           BLayout,
+                                           DsLayout,
+                                           ELayout,
+                                           ADataType,
+                                           BDataType,
+                                           DsDataType,
+                                           EDataType,
+                                           AElementwiseOperation,
+                                           BElementwiseOperation,
+                                           CDEElementwiseOperation>
+{
+    using DeviceOp = DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK;
+
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    static constexpr auto conv_to_gemm_transformer =
+        TransformConvFwdToGemm<NDimSpatial, ConvForwardSpecialization>{};
+
+    static constexpr auto matrix_padder =
+        MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, K0PerBlock};
+
+    template <typename ALay>
+    static auto
+    MakeAGridDescriptor_AK0_M_AK1(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+                                  const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                                  const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                                  const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                                  const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
+                                  const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
+                                  const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                                  const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                                  const std::array<index_t, NDimSpatial>& input_left_pads,
+                                  const std::array<index_t, NDimSpatial>& input_right_pads)
+    {
+        const auto in_gemmmraw_gemmkraw_desc =
+            conv_to_gemm_transformer.template MakeADescriptor_M_K<ALay>(a_g_n_c_wis_lengths,
+                                                                        a_g_n_c_wis_strides,
+                                                                        b_g_k_c_xs_lengths,
+                                                                        b_g_k_c_xs_strides,
+                                                                        e_g_n_k_wos_lengths,
+                                                                        e_g_n_k_wos_strides,
+                                                                        conv_filter_strides,
+                                                                        conv_filter_dilations,
+                                                                        input_left_pads,
+                                                                        input_right_pads);
+
+        const auto in_gemmm_gemmk_desc =
+            matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc);
+
+        const auto M = in_gemmm_gemmk_desc.GetLength(I0);
+        const auto K = in_gemmm_gemmk_desc.GetLength(I1);
+
+        const auto AK0 = K / K1;
+
+        return transform_tensor_descriptor(
+            in_gemmm_gemmk_desc,
+            make_tuple(make_unmerge_transform(make_tuple(AK0, K1)), make_pass_through_transform(M)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    }
+
+    template <typename BLay>
+    static auto
+    MakeBGridDescriptor_BK0_N_BK1(const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                                  const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides)
+    {
+        const auto wei_gemmnraw_gemmkraw_desc =
+            conv_to_gemm_transformer.template MakeBDescriptor_N_K<BLay>(b_g_k_c_xs_lengths,
+                                                                        b_g_k_c_xs_strides);
+
+        const auto wei_gemmn_gemmk_desc =
+            matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc);
+
+        const auto N = wei_gemmn_gemmk_desc.GetLength(I0);
+        const auto K = wei_gemmn_gemmk_desc.GetLength(I1);
+
+        const auto BK0 = K / K1;
+
+        return transform_tensor_descriptor(
+            wei_gemmn_gemmk_desc,
+            make_tuple(make_unmerge_transform(make_tuple(BK0, K1)), make_pass_through_transform(N)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    }
+
+    template <typename ELay>
+    static auto
+    MakeEGridDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
+                            const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides)
+    {
+        const auto out_gemmmraw_gemmnraw_desc =
+            conv_to_gemm_transformer.template MakeCDescriptor_M_N<ELay>(e_g_n_k_wos_lengths,
+                                                                        e_g_n_k_wos_strides);
+
+        const auto out_gemmm_gemmn_desc =
+            matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc);
+
+        return out_gemmm_gemmn_desc;
+    }
+
+    static auto MakeDsGridDescriptor_M_N(
+        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
+        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(ds_g_n_k_wos_lengths[i],
+                                                                  ds_g_n_k_wos_strides[i]);
+            },
+            Number<NumDTensor>{});
+    }
+
+    // desc for problem definition
+    using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
+        MakeAGridDescriptor_AK0_M_AK1<ALayout>({}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>;
+    using BGridDesc_BK0_N_BK1 =
+        remove_cvref_t<decltype(MakeBGridDescriptor_BK0_N_BK1<BLayout>({}, {}))>;
+    using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({}, {}))>;
+    using EGridDesc_M_N  = remove_cvref_t<decltype(MakeEGridDescriptor_M_N<ELayout>({}, {}))>;
+
+    // GridwiseGemm
+    using GridwiseGemm =
+        GridwiseGemmDlMultipleD_km_kn_mn<BlockSize,
+                                         ADataType,
+                                         AccDataType,
+                                         DsDataType,
+                                         EDataType,
+                                         AElementwiseOperation,
+                                         BElementwiseOperation,
+                                         CDEElementwiseOperation,
+                                         InMemoryDataOperationEnum::Set,
+                                         AGridDesc_AK0_M_AK1,
+                                         BGridDesc_BK0_N_BK1,
+                                         EGridDesc_M_N,
+                                         MPerBlock,
+                                         NPerBlock,
+                                         K0PerBlock,
+                                         K1,
+                                         M1PerThread,
+                                         N1PerThread,
+                                         KPerThread,
+                                         M1N1ThreadClusterM1Xs,
+                                         M1N1ThreadClusterN1Xs,
+                                         ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
+                                         ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
+                                         ABlockTransferThreadClusterArrangeOrder,
+                                         ABlockTransferSrcAccessOrder,
+                                         ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
+                                         ABlockTransferSrcVectorTensorContiguousDimOrder,
+                                         ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
+                                         BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
+                                         BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
+                                         BBlockTransferThreadClusterArrangeOrder,
+                                         BBlockTransferSrcAccessOrder,
+                                         BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
+                                         BBlockTransferSrcVectorTensorContiguousDimOrder,
+                                         BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
+                                         CThreadTransferSrcDstAccessOrder,
+                                         CThreadTransferSrcDstVectorDim,
+                                         CThreadTransferDstScalarPerVector>;
+
+    using AGridDesc_K0_M0_M1_K1 =
+        decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_AK0_M_AK1{}));
+    using BGridDesc_K0_N0_N1_K1 =
+        decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_BK0_N_BK1{}));
+    using DsGridDesc_M0_M10_M11_N0_N10_N11 =
+        decltype(GridwiseGemm::MakeDsGridDescriptor_M0_M10_M11_N0_N10_N11(DsGridDesc_M_N{}));
+    using CGridDesc_M0_M10_M11_N0_N10_N11 =
+        decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(EGridDesc_M_N{}));
+    using DefaultBlock2CTileMap =
+        decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(EGridDesc_M_N{}));
+
+    // Argument
+    struct Argument : public BaseArgument
+    {
+        Argument(const void* p_a,
+                 const void* p_b,
+                 const std::array<const void*, NumDTensor>& p_ds,
+                 void* p_e,
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
+                     ds_g_n_k_wos_lengths,
+                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
+                     ds_g_n_k_wos_strides,
+                 const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
+                 const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                 const std::array<index_t, NDimSpatial>& input_left_pads,
+                 const std::array<index_t, NDimSpatial>& input_right_pads,
+                 const AElementwiseOperation& a_element_op,
+                 const BElementwiseOperation& b_element_op,
+                 const CDEElementwiseOperation& cde_element_op)
+            : p_a_grid_{static_cast<const ADataType*>(p_a)},
+              p_b_grid_{static_cast<const BDataType*>(p_b)},
+              p_ds_grid_{},
+              p_e_grid_{static_cast<EDataType*>(p_e)},
+              num_group_{a_g_n_c_wis_lengths[0]},
+              a_grid_desc_ak0_m_ak1_{
+                  DeviceOp::MakeAGridDescriptor_AK0_M_AK1<ALayout>(a_g_n_c_wis_lengths,
+                                                                   a_g_n_c_wis_strides,
+                                                                   b_g_k_c_xs_lengths,
+                                                                   b_g_k_c_xs_strides,
+                                                                   e_g_n_k_wos_lengths,
+                                                                   e_g_n_k_wos_strides,
+                                                                   conv_filter_strides,
+                                                                   conv_filter_dilations,
+                                                                   input_left_pads,
+                                                                   input_right_pads)},
+              b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1<BLayout>(
+                  b_g_k_c_xs_lengths, b_g_k_c_xs_strides)},
+              e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N<ELayout>(e_g_n_k_wos_lengths,
+                                                                          e_g_n_k_wos_strides)},
+              a_grid_desc_k0_m0_m1_k1_{},
+              b_grid_desc_k0_n0_n1_k1_{},
+              ds_grid_desc_m0_m10_m11_n0_n10_n11_{},
+              e_grid_desc_m0_m10_m11_n0_n10_n11_{},
+              block_2_ctile_map_{},
+              compute_ptr_offset_of_batch_{},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op},
+              a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths},
+              a_g_n_c_wis_strides_{a_g_n_c_wis_strides},
+              b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
+              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
+              e_g_n_k_wos_lengths_{e_g_n_k_wos_lengths},
+              e_g_n_k_wos_strides_{e_g_n_k_wos_strides},
+              conv_filter_strides_{conv_filter_strides},
+              conv_filter_dilations_{conv_filter_dilations},
+              input_left_pads_{input_left_pads},
+              input_right_pads_{input_right_pads}
+        {
+            // A/B/E Batch Stride
+            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_c_wis_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0];
+
+            // populate pointer, batch stride, desc for Ds
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout   = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                // D pointer
+                p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
+
+                // D batch stride
+                compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0];
+
+                // D desc
+                ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N<DLayout>(
+                    ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]);
+            });
+
+            // populate desc for Ds/E
+            if(GridwiseGemm::CheckValidity(
+                   a_grid_desc_ak0_m_ak1_, b_grid_desc_bk0_n_bk1_, e_grid_desc_m_n_))
+            {
+
+                a_grid_desc_k0_m0_m1_k1_ =
+                    GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_ak0_m_ak1_);
+                b_grid_desc_k0_n0_n1_k1_ =
+                    GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_bk0_n_bk1_);
+                e_grid_desc_m0_m10_m11_n0_n10_n11_ =
+                    GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(e_grid_desc_m_n_);
+
+                ds_grid_desc_m0_m10_m11_n0_n10_n11_ =
+                    GridwiseGemm::MakeDsGridDescriptor_M0_M10_M11_N0_N10_N11(ds_grid_desc_m_n_);
+
+                block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_);
+            }
+        }
+
+        void Print() const
+        {
+            std::cout << "A[K0, M, K1]: " << a_grid_desc_ak0_m_ak1_ << std::endl;
+            std::cout << "B[K0, N, K1]: " << b_grid_desc_bk0_n_bk1_ << std::endl;
+            std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl;
+            std::cout << "num_group: " << num_group_ << std::endl;
+
+            std::cout << "A[k0, m0, m1, k1]: " << a_grid_desc_k0_m0_m1_k1_ << std::endl;
+            std::cout << "B[k0, n0, n1, k1]: " << b_grid_desc_k0_n0_n1_k1_ << std::endl;
+            std::cout << "A[m0, m10, m11, n0, n10, n11]: " << e_grid_desc_m0_m10_m11_n0_n10_n11_
+                      << std::endl;
+        }
+
+        //  private:
+        // pointers
+        const ADataType* p_a_grid_;
+        const BDataType* p_b_grid_;
+        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        EDataType* p_e_grid_;
+
+        // tensor descriptors for problem definiton
+        index_t num_group_;
+        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
+        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
+        DsGridDesc_M_N ds_grid_desc_m_n_;
+        EGridDesc_M_N e_grid_desc_m_n_;
+
+        // tensor descriptors for block/thread-wise copy
+        AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_;
+        BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_;
+        DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11_;
+        CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11_;
+
+        // block-to-e-tile map
+        DefaultBlock2CTileMap block_2_ctile_map_;
+
+        // for computing batch offset
+        ComputePtrOffsetOfStridedBatch<NumDTensor> compute_ptr_offset_of_batch_;
+
+        // element-wise op
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+
+        // for checking IsSupportedArgument()
+        std::array<index_t, NDimSpatial + 3> a_g_n_c_wis_lengths_;
+        std::array<index_t, NDimSpatial + 3> a_g_n_c_wis_strides_;
+        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
+        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
+        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_k_wos_lengths_;
+        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_k_wos_strides_;
+        std::array<index_t, NDimSpatial + 3> e_g_n_k_wos_lengths_;
+        std::array<index_t, NDimSpatial + 3> e_g_n_k_wos_strides_;
+        std::array<index_t, NDimSpatial> conv_filter_strides_;
+        std::array<index_t, NDimSpatial> conv_filter_dilations_;
+        std::array<index_t, NDimSpatial> input_left_pads_;
+        std::array<index_t, NDimSpatial> input_right_pads_;
+    };
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+
+        float Run(const Argument& arg, const StreamConfig& stream_config)
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(
+                   arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_m_n_))
+            {
+                throw std::runtime_error(
+                    "wrong! DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK has invalid setting");
+            }
+
+            const index_t grid_size =
+                GridwiseGemm::CalculateGridSize(arg.e_grid_desc_m_n_.GetLength(I0),
+                                                arg.e_grid_desc_m_n_.GetLength(I1)) *
+                arg.num_group_;
+
+            auto launch_kernel = [&](auto has_main_k_block_loop,
+                                     auto has_double_tail_k_block_loop) {
+                constexpr bool has_main_loop   = has_main_k_block_loop.value;
+                constexpr bool has_double_loop = has_double_tail_k_block_loop;
+
+                const auto kernel = kernel_grouped_conv_fwd_dl_multiple_d<
+                    GridwiseGemm,
+                    ADataType, // TODO: distiguish A/B datatype
+                    typename GridwiseGemm::DsGridPointer,
+                    EDataType,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    CDEElementwiseOperation,
+                    DeviceOp::AGridDesc_K0_M0_M1_K1,
+                    DeviceOp::BGridDesc_K0_N0_N1_K1,
+                    DeviceOp::DsGridDesc_M0_M10_M11_N0_N10_N11,
+                    DeviceOp::CGridDesc_M0_M10_M11_N0_N10_N11,
+                    DefaultBlock2CTileMap,
+                    ComputePtrOffsetOfStridedBatch<NumDTensor>,
+                    has_main_loop,
+                    has_double_loop>;
+
+                return launch_and_time_kernel(stream_config,
+                                              kernel,
+                                              dim3(grid_size),
+                                              dim3(BlockSize),
+                                              0,
+                                              arg.p_a_grid_,
+                                              arg.p_b_grid_,
+                                              arg.p_ds_grid_,
+                                              arg.p_e_grid_,
+                                              arg.a_element_op_,
+                                              arg.b_element_op_,
+                                              arg.cde_element_op_,
+                                              arg.a_g_n_c_wis_lengths_[0], // Group count
+                                              arg.a_grid_desc_k0_m0_m1_k1_,
+                                              arg.b_grid_desc_k0_n0_n1_k1_,
+                                              arg.ds_grid_desc_m0_m10_m11_n0_n10_n11_,
+                                              arg.e_grid_desc_m0_m10_m11_n0_n10_n11_,
+                                              arg.block_2_ctile_map_,
+                                              arg.compute_ptr_offset_of_batch_);
+            };
+
+            const auto K0                    = arg.a_grid_desc_k0_m0_m1_k1_.GetLength(I0);
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0);
+            const bool has_double_tail_k_block_loop =
+                GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0);
+
+            if(has_main_k_block_loop && has_double_tail_k_block_loop)
+            {
+                return launch_kernel(integral_constant<bool, true>{},
+                                     integral_constant<bool, true>{});
+            }
+            else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
+            {
+                return launch_kernel(integral_constant<bool, true>{},
+                                     integral_constant<bool, false>{});
+            }
+            else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
+            {
+                return launch_kernel(integral_constant<bool, false>{},
+                                     integral_constant<bool, true>{});
+            }
+            else
+            {
+                return launch_kernel(integral_constant<bool, false>{},
+                                     integral_constant<bool, false>{});
+            }
+            return 0;
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        namespace ctc = tensor_layout::convolution;
+
+        // check device
+        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
+        {
+            return false;
+        }
+
+        // check ConvolutionForwardSpecialization
+        if constexpr(ConvForwardSpecialization ==
+                     ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
+        {
+            // check if it's 1x1, stride=1 conv
+            for(index_t i = 0; i < NDimSpatial; ++i)
+            {
+                const index_t X          = arg.b_g_k_c_xs_lengths_[i + 3];
+                const index_t ConvStride = arg.conv_filter_strides_[i];
+                const index_t LeftPad    = arg.input_left_pads_[i];
+                const index_t RightPad   = arg.input_right_pads_[i];
+
+                if(!(X == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0))
+                {
+                    std::cout << "Filter1x1Stride1Pad0 check: XY_index = " << i << " X = " << X
+                              << " ConvStride = " << ConvStride << " LeftPad = " << LeftPad
+                              << " RightPad = " << RightPad << std::endl;
+                    return false;
+                }
+            }
+        }
+        else if constexpr(ConvForwardSpecialization ==
+                          ConvolutionForwardSpecialization::Filter1x1Pad0)
+        {
+            // check if it's 1x1 conv
+            for(index_t i = 0; i < NDimSpatial; ++i)
+            {
+                const index_t X        = arg.b_g_k_c_xs_lengths_[i + 3];
+                const index_t LeftPad  = arg.input_left_pads_[i];
+                const index_t RightPad = arg.input_right_pads_[i];
+
+                if(!(X == 1 && LeftPad == 0 && RightPad == 0))
+                {
+                    std::cout << "Filter1x1Stride1Pad0 check: XY_index = " << i << " X = " << X
+                              << " LeftPad = " << LeftPad << " RightPad = " << RightPad
+                              << std::endl;
+                    return false;
+                }
+            }
+        }
+
+        // check vector access of A
+        // FIXME: layout
+        if constexpr(is_same_v<ALayout, ctc::G_NW_C> || is_same_v<ALayout, ctc::G_NHW_C> ||
+                     is_same_v<ALayout, ctc::G_NDHW_C> || is_same_v<ALayout, ctc::GNWC> ||
+                     is_same_v<ALayout, ctc::GNHWC> || is_same_v<ALayout, ctc::GNDHWC> ||
+                     is_same_v<ALayout, ctc::NWGC> || is_same_v<ALayout, ctc::NHWGC> ||
+                     is_same_v<ALayout, ctc::NDHWGC>)
+        {
+            auto srcVectorLengths = ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1{};
+            if(srcVectorLengths[I1] != 1 || srcVectorLengths[I2] != 1)
+            {
+                return false;
+            }
+            if(K1 % srcVectorLengths[I3] != 0 || K0PerBlock % srcVectorLengths[I0] != 0)
+            {
+                return false;
+            }
+
+            const index_t C = arg.a_g_n_c_wis_lengths_[2];
+
+            if(C % (srcVectorLengths[I0] * srcVectorLengths[I3]) != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        // check vector access of B
+        // FIXME: layout
+        if constexpr(is_same_v<BLayout, ctc::G_K_X_C> || is_same_v<BLayout, ctc::G_K_YX_C> ||
+                     is_same_v<BLayout, ctc::G_K_ZYX_C> || is_same_v<BLayout, ctc::GKXC> ||
+                     is_same_v<BLayout, ctc::GKYXC> || is_same_v<BLayout, ctc::GKZYXC> ||
+                     is_same_v<BLayout, ctc::KXGC> || is_same_v<BLayout, ctc::KYXGC> ||
+                     is_same_v<BLayout, ctc::KZYXGC>)
+
+        {
+            auto srcVectorLengths = BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1{};
+            if(srcVectorLengths[I1] != 1 || srcVectorLengths[I2] != 1)
+            {
+                return false;
+            }
+            if(K1 % srcVectorLengths[I3] != 0 || K0PerBlock % srcVectorLengths[I0] != 0)
+            {
+                return false;
+            }
+
+            const index_t C = arg.b_g_k_c_xs_lengths_[2];
+
+            if(C % (srcVectorLengths[I0] * srcVectorLengths[I3]) != 0)
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        // check vector access of E
+        if constexpr(is_same_v<ELayout, ctc::G_NW_K> || is_same_v<ELayout, ctc::G_NHW_K> ||
+                     is_same_v<ELayout, ctc::G_NDHW_K> || is_same_v<ELayout, ctc::GNWK> ||
+                     is_same_v<ELayout, ctc::GNHWK> || is_same_v<ELayout, ctc::GNDHWK> ||
+                     is_same_v<ELayout, ctc::NWGK> || is_same_v<ELayout, ctc::NHWGK> ||
+                     is_same_v<ELayout, ctc::NDHWGK>)
+        {
+            const index_t K = arg.e_g_n_k_wos_lengths_[2];
+
+            if(!(K % CThreadTransferDstScalarPerVector == 0 && CThreadTransferSrcDstVectorDim == 5))
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+        // check Gridwise GEMM
+        return GridwiseGemm::CheckValidity(
+            arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_m_n_);
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(
+        const void* p_a,
+        const void* p_b,
+        const std::array<const void*, NumDTensor>& p_ds,
+        void* p_e,
+        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
+        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides,
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const AElementwiseOperation& a_element_op,
+        const BElementwiseOperation& b_element_op,
+        const CDEElementwiseOperation& cde_element_op)
+    {
+        return Argument{p_a,
+                        p_b,
+                        p_ds,
+                        p_e,
+                        a_g_n_c_wis_lengths,
+                        a_g_n_c_wis_strides,
+                        b_g_k_c_xs_lengths,
+                        b_g_k_c_xs_strides,
+                        ds_g_n_k_wos_lengths,
+                        ds_g_n_k_wos_strides,
+                        e_g_n_k_wos_lengths,
+                        e_g_n_k_wos_strides,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(
+        const void* p_a,
+        const void* p_b,
+        const std::array<const void*, NumDTensor>& p_ds,
+        void* p_e,
+        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
+        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides,
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const AElementwiseOperation& a_element_op,
+        const BElementwiseOperation& b_element_op,
+        const CDEElementwiseOperation& cde_element_op) override
+    {
+        return std::make_unique<Argument>(p_a,
+                                          p_b,
+                                          p_ds,
+                                          p_e,
+                                          a_g_n_c_wis_lengths,
+                                          a_g_n_c_wis_strides,
+                                          b_g_k_c_xs_lengths,
+                                          b_g_k_c_xs_strides,
+                                          ds_g_n_k_wos_lengths,
+                                          ds_g_n_k_wos_strides,
+                                          e_g_n_k_wos_lengths,
+                                          e_g_n_k_wos_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads,
+                                          a_element_op,
+                                          b_element_op,
+                                          cde_element_op);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << K0PerBlock << ", "
+            << getConvForwardSpecializationString(ConvForwardSpecialization)
+            << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck