Commit 9dce6851 authored by Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
add_example_executable(example_conv3d_fwd_xdl conv3d_fwd_xdl.cpp)
add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
#include <iostream>
#include <numeric>
#include <type_traits>
#include "config.hpp"
#include "conv_utils.hpp"
#include "device.hpp"
add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp)
# Instructions for ```conv2d_bwd_data_xdl``` Example
## Docker script
```bash
...
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv2d_bwd_data_xdl```
```bash
mkdir build && cd build
```
```bash
cmake \
...
```
```bash
make -j conv2d_bwd_data_xdl
```
## Run ```conv2d_bwd_data_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./bin/conv2d_bwd_data_xdl 0 1 5
```
Result
```
in_n_c_hi_wi: dim 4, lengths {128, 256, 71, 71}, strides {1290496, 1, 18176, 256}
wei_k_c_y_x: dim 4, lengths {256, 256, 3, 3}, strides {2304, 1, 768, 256}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
arg.a_grid_desc_k0_m_k1_container_{128, 175232, 8}
arg.b_grid_desc_k0_n_k1_container_{128, 256, 8}
arg.c_grid_desc_m_n_container_{ 175232, 256}
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 4, 2, 2, 4, 2 )
launch_and_time_kernel: grid_dim {2738, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 1 times...
arg.a_grid_desc_k0_m_k1_container_{64, 175232, 8}
arg.b_grid_desc_k0_n_k1_container_{64, 256, 8}
arg.c_grid_desc_m_n_container_{ 175232, 256}
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 4, 2, 2, 4, 2 )
launch_and_time_kernel: grid_dim {2738, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 1 times...
arg.a_grid_desc_k0_m_k1_container_{64, 175232, 8}
arg.b_grid_desc_k0_n_k1_container_{64, 256, 8}
arg.c_grid_desc_m_n_container_{ 175232, 256}
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 4, 2, 2, 4, 2 )
launch_and_time_kernel: grid_dim {2738, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 1 times...
arg.a_grid_desc_k0_m_k1_container_{32, 175232, 8}
arg.b_grid_desc_k0_n_k1_container_{32, 256, 8}
arg.c_grid_desc_m_n_container_{ 175232, 256}
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 4, 2, 2, 4, 2 )
launch_and_time_kernel: grid_dim {2738, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 1 times...
Perf: 2.45966 ms, 79.5597 TFlops, 169.325 GB/s
```
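Note: with a 3x3 filter and stride 2, backward data appears to decompose into 2x2 = 4 sub-GEMMs, one per stride phase, which is why the log above shows four kernel launches. Each phase covers a different subset of filter taps (4, 2, 2 and 1 taps respectively), so the GEMM K0 dimension shrinks accordingly: K0 = C * taps / K1 = 128, 64, 64 and 32 for C = 256, K1 = 8.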
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "element_wise_operation.hpp"
#include "device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
#include "reference_conv_bwd_data.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvBwdDefault =
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
using DeviceConvBwdDataInstance = ck::tensor_operation::device::
DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
InDataType, // InDataType
WeiDataType, // WeiDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
InElementOp, // InElementwiseOperation
WeiElementOp, // WeiElementwiseOperation
OutElementOp, // OutElementwiseOperation
ConvBwdDefault, // ConvolutionBackwardDataSpecialization_t
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<2, 0, 1>, // BBlockTransferThreadClusterArrangeOrder
S<0, 2, 1>, // BBlockTransferSrcAccessOrder
1, // BBlockTransferSrcVectorDim
2, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
7,  // CThreadTransferSrcDstVectorDim
1>; // GemmCThreadTransferDstScalarPerVector
using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = false;
int init_method = 0;
int nrepeat = 5;
// Conv shape
ck::index_t N = 128;
ck::index_t K = 256;
ck::index_t C = 256;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t conv_stride_h = 2;
ck::index_t conv_stride_w = 2;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
}
else if(argc == 19)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
C = std::stoi(argv[6]);
Y = std::stoi(argv[7]);
X = std::stoi(argv[8]);
Hi = std::stoi(argv[9]);
Wi = std::stoi(argv[10]);
conv_stride_h = std::stoi(argv[11]);
conv_stride_w = std::stoi(argv[12]);
conv_dilation_h = std::stoi(argv[13]);
conv_dilation_w = std::stoi(argv[14]);
in_left_pad_h = std::stoi(argv[15]);
in_left_pad_w = std::stoi(argv[16]);
in_right_pad_h = std::stoi(argv[17]);
in_right_pad_w = std::stoi(argv[18]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
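// With the default shape: YEff = (3 - 1) * 1 + 1 = 3 and
// Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36, matching the out_n_k_ho_wo lengths
// {128, 256, 36, 36} in the sample output above.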
const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}};
const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}};
const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}};
const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}};
// tensor layout
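// Physical layout is NHWC, expressed in logical NCHW order: element (n, c, h, w)
// sits at offset n*C*H*W + h*W*C + w*C + c, hence the strides {C*H*W, 1, W*C, C}.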
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
};
Tensor<OutDataType> out_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X));
Tensor<InDataType> in_n_c_hi_wi_host_result(f_host_tensor_descriptor(N, C, Hi, Wi));
Tensor<InDataType> in_n_c_hi_wi_device_result(f_host_tensor_descriptor(N, C, Hi, Wi));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi_host_result.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
}
DeviceMem in_device_buf(sizeof(InDataType) *
in_n_c_hi_wi_device_result.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace());
out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
// do GEMM
auto conv = DeviceConvBwdDataInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
std::vector<ck::index_t>{{Hi, Wi}},
std::vector<ck::index_t>{{Y, X}},
std::vector<ck::index_t>{{Ho, Wo}},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
float ave_time = invoker.Run(argument, nrepeat);
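// One multiply-accumulate per (n, k, ho, wo, c, y, x) tuple, counted as 2 FLOPs.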
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
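// ave_time is in milliseconds, so flop / 1.E9 / ms gives TFLOPS and
// bytes / 1.E6 / ms gives GB/s.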
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
auto ref_conv = ReferenceConvBwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result,
wei_k_c_y_x,
out_n_k_ho_wo,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
}
}
add_example_executable(example_conv2d_bwd_wgt_xdl conv2d_bwd_wgt_xdl.cpp)
# Instructions for ```conv2d_wrw_xdl``` Example
## Docker script
```bash
...
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv2d_wrw_xdl```
```bash
mkdir build && cd build
```
```bash
cmake \
...
```
```bash
make -j conv2d_wrw_xdl
```
## Run ```conv2d_wrw_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4: show log (0=no, 1=yes)
#arg5: split-k
#arg6 to 20: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./example/conv2d_wrw_xdl 0 1 5 0 4
```
Result
```
in_n_c_hi_wi: dim 4, lengths {128, 1024, 14, 14}, strides {200704, 1, 14336, 1024}
wei_k_c_y_x: dim 4, lengths {256, 1024, 3, 3}, strides {9216, 1, 3072, 1024}
out_n_k_ho_wo: dim 4, lengths {128, 256, 6, 6}, strides {9216, 1, 1536, 256}
arg.a_grid_desc_kbatch_k0_m_k1_{4, 144, 256, 8}
arg.b_grid_desc_kbatch_k0_n_k1_{4, 144, 9216, 8}
arg.c_grid_desc_m_n_{ 256, 9216}
launch_and_time_kernel: grid_dim {576, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 0.401084 ms, 54.2112 TFlops, 145.75 GB/s
```
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "element_wise_operation.hpp"
#include "device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "reference_conv_backward_weight.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
// clang-format off
using DeviceConvWrWInstance = ck::tensor_operation::device::
DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
InDataType, // InDataType
WeiDataType, // WeiDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
InElementOp, // InElementwiseOperation
WeiElementOp, // WeiElementwiseOperation
OutElementOp, // OutElementwiseOperation
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
2, // NXdlPerWave
S<1, 4, 16, 4>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<0, 3, 1, 2>, // ABlockTransferThreadClusterArrangeOrder
S<0, 2, 1, 3>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
2, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<1, 4, 16, 4>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<0, 3, 1, 2>, // BBlockTransferThreadClusterArrangeOrder
S<0, 2, 1, 3>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
2, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 4>, // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
using ReferenceConvWrwInstance = ck::tensor_operation::host::
ReferenceConvWrw<InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = false;
int init_method = 0;
int nrepeat = 5;
int do_log = 0;
int split_k = 4;
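// split_k > 1 partitions the GEMM reduction (K) dimension across workgroups,
// whose partial results are accumulated; the sample log's
// a_grid_desc_kbatch_k0_m_k1_{4, ...} shows the extra kbatch dimension for split_k = 4.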
// Conv shape
ck::index_t N = 128;
ck::index_t K = 256;
ck::index_t C = 1024;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 14;
ck::index_t Wi = 14;
ck::index_t conv_stride_h = 2;
ck::index_t conv_stride_w = 2;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 0;
ck::index_t in_left_pad_w = 0;
ck::index_t in_right_pad_h = 0;
ck::index_t in_right_pad_w = 0;
if(argc == 6)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
do_log = std::stoi(argv[4]);
split_k = std::stoi(argv[5]);
}
else if(argc == 21)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
do_log = std::stoi(argv[4]);
split_k = std::stoi(argv[5]);
N = std::stoi(argv[6]);
K = std::stoi(argv[7]);
C = std::stoi(argv[8]);
Y = std::stoi(argv[9]);
X = std::stoi(argv[10]);
Hi = std::stoi(argv[11]);
Wi = std::stoi(argv[12]);
conv_stride_h = std::stoi(argv[13]);
conv_stride_w = std::stoi(argv[14]);
conv_dilation_h = std::stoi(argv[15]);
conv_dilation_w = std::stoi(argv[16]);
in_left_pad_h = std::stoi(argv[17]);
in_left_pad_w = std::stoi(argv[18]);
in_right_pad_h = std::stoi(argv[19]);
in_right_pad_w = std::stoi(argv[20]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg4: is show log (0=no, 1=yes)\n");
printf("arg5: split-k \n");
printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
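// With the default shape: YEff = 3 and Ho = (14 + 0 + 0 - 3) / 2 + 1 = 6,
// matching the out_n_k_ho_wo lengths {128, 256, 6, 6} in the sample output above.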
const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}};
const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}};
const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}};
const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}};
// tensor layout
auto f_host_tensor_descriptor = [](std::size_t N_,
std::size_t C_,
std::size_t H,
std::size_t W,
auto layout) {
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::KYXC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x_host_result(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<WeiDataType> wei_k_c_y_x_device_result(
f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x_host_result.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
}
wei_k_c_y_x_device_result.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{0});
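// Zero-initialize the weight output buffer: with split-k the kernel
// accumulates partial results from each K-slice into it.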
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) *
wei_k_c_y_x_device_result.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x_device_result.mData.data());
// do GEMM
auto conv = DeviceConvWrWInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
std::vector<ck::index_t>{{Hi, Wi}},
std::vector<ck::index_t>{{Y, X}},
std::vector<ck::index_t>{{Ho, Wo}},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{},
split_k);
if(!conv.IsSupportedArgument(argument))
{
std::cout << "wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
<< std::endl;
return 1;
}
float ave_time = invoker.Run(argument, nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
auto ref_conv = ReferenceConvWrwInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x_host_result,
out_n_k_ho_wo,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
if(do_log)
{
LogRangeAsType<float>(std::cout << "out: ", out_n_k_ho_wo.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",") << std::endl;
LogRangeAsType<float>(
std::cout << "wei_device(after): ", wei_k_c_y_x_device_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",")
<< std::endl;
}
check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
}
}
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include <sstream>
#include <thread>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_reduce_blockwise.hpp"
#include "host_reduce_util.hpp"
#include "host_generic_reduction.hpp"
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
using InDataType = half_float::half;
using OutDataType = half_float::half;
using AccDataType = float;
using kInDataType = ck::half_t;
using kOutDataType = ck::half_t;
using kAccDataType = float;
constexpr int Rank = 4;
using ReduceDims_ = ck::Sequence<0, 1, 2>;
constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN;
constexpr bool PropagateNan = (NanOpt != NanPropagation_t::NOT_PROPAGATE_NAN);
constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES;
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
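// Blockwise-reduction instance: 256 is the thread-block size; the remaining
// integer template arguments are thread-cluster, slice-size, and vectorization
// tuning parameters of DeviceReduceBlockWise.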
using DeviceReduceInstance = DeviceReduceBlockWise<kInDataType,
kAccDataType,
kOutDataType,
Rank,
ReduceDims_,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
false,
256,
4,
64,
1,
1,
0,
1,
1>;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
{"scales", required_argument, nullptr, 'S'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class SimpleAppArgs
{
template <typename T>
static T getSingleValueFromString(const std::string& valueStr)
{
std::istringstream iss(valueStr);
T ret;
iss >> ret;
return (ret);
};
template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
std::string valuesStr(cstr_values);
std::vector<T> values;
std::size_t pos = 0;
std::size_t new_pos;
new_pos = valuesStr.find(',', pos);
while(new_pos != std::string::npos)
{
const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
pos = new_pos + 1;
new_pos = valuesStr.find(',', pos);
};
std::string sliceStr = valuesStr.substr(pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
return (values);
};
private:
int option_index = 0;
public:
std::vector<size_t> inLengths;
std::vector<float> scales;
bool do_verification = false;
int init_method = 1;
int nrepeat = 5;
public:
void show_usage(const char* cmd)
{
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
<< std::endl;
std::cout << "--scales or -S, comma separated two float values for alpha and beta"
<< std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<< std::endl;
};
int processArgs(int argc, char* argv[])
{
int ch;
while(1)
{
ch = getopt_long(argc, argv, "D:S:v:l:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inLengths = getTypeValuesFromString<size_t>(optarg);
break;
case 'S':
if(!optarg)
throw std::runtime_error("Invalid option format!");
scales = getTypeValuesFromString<float>(optarg);
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return (-1);
};
break;
default: show_usage(argv[0]); return (-1);
};
};
if(optind + 2 > argc)
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]);
nrepeat = std::atoi(argv[optind]);
if(scales.empty())
{
scales.push_back(1.0f);
scales.push_back(0.0f);
};
return (0);
};
};
template <int Rank, typename ReduceDims>
static std::vector<int> get_reduce_dims()
{
std::vector<int> resDims;
static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); });
return (resDims);
};
template <int Rank, typename ReduceDims>
static std::vector<int> get_invariant_dims()
{
std::vector<int> resDims;
unsigned int incFlag = 0;
static_for<0, ReduceDims::Size(), 1>{}(
[&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); });
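// incFlag now has bit d set for every reduced dimension d; dimensions whose
// bit is clear are invariant and are kept in the output.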
for(int dim = 0; dim < Rank; dim++)
{
if(incFlag & (0x1 << dim))
continue;
resDims.push_back(dim);
};
return (resDims);
};
int main(int argc, char* argv[])
{
using namespace ck::host_reduce;
SimpleAppArgs args;
if(args.processArgs(argc, argv) < 0)
return (-1);
constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX);
constexpr bool NeedIndices =
(op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES));
// if the input is half type, there is no reason to use float for indexed reduction
// operations, and float must be used for non-indexed reductions for accuracy
constexpr bool invalid_reduce_1 =
std::is_same<InDataType, ck::half_t>::value &&
((!op_support_indices && !std::is_same<AccDataType, float>::value) ||
(op_support_indices && !std::is_same<AccDataType, ck::half_t>::value));
// if the input is float type, there is no reason to use double for indexed reduction operations
constexpr bool invalid_reduce_2 =
std::is_same<InDataType, float>::value &&
(op_support_indices && !std::is_same<AccDataType, float>::value);
// indices option can only be used when it is really needed
constexpr bool invalid_reduce_3 =
(!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
if constexpr(invalid_reduce)
{
std::cout << "Reduction setting is not supported, exiting!" << std::endl;
return (-1);
};
Tensor<InDataType> in(args.inLengths);
const std::vector<int> InvariantDims = get_invariant_dims<Rank, ReduceDims_>();
const std::vector<int> ReduceDims = get_reduce_dims<Rank, ReduceDims_>();
std::vector<size_t> outLengths;
if(InvariantDims.empty())
outLengths.push_back(1);
else
for(auto dim : InvariantDims)
outLengths.push_back(args.inLengths[dim]);
Tensor<OutDataType> out_ref(outLengths);
Tensor<OutDataType> out(outLengths);
Tensor<int> out_indices_ref(outLengths);
Tensor<int> out_indices(outLengths);
auto inStrides = in.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
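// Example (hypothetical input): with --inLengths 16,64,32,960 and
// ReduceDims_ = Sequence<0, 1, 2>, invariant_total_length = 960 and
// reduce_total_length = 16 * 64 * 32.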
float alpha = args.scales[0];
float beta = args.scales[1];
std::size_t num_thread = std::thread::hardware_concurrency();
if(args.do_verification)
{
switch(args.init_method)
{
case 0:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{}, num_thread);
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{1, 5}, num_thread);
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
out.mData[i] = out_ref.mData[i];
};
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
in_dev.ToDevice(in.mData.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
DeviceMem out_indices_dev(indicesSizeInBytes);
if(args.do_verification)
{
ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, PropagateNan, NeedIndices>
hostReduce(in.mDesc, out_ref.mDesc, InvariantDims, ReduceDims);
hostReduce.Run(
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
};
const auto i_inLengths = to_int_vector(args.inLengths);
const auto i_inStrides = to_int_vector(inStrides);
const auto i_outLengths = to_int_vector(outLengths);
const auto i_outStrides = to_int_vector(outStrides);
auto reduce = DeviceReduceInstance{};
auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths);
DeviceMem ws_dev(wsSizeInBytes);
auto argument_ptr =
reduce.MakeArgumentPointer(i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
alpha,
beta,
in_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
out_indices_dev.GetDeviceBuffer(),
ws_dev.GetDeviceBuffer(),
InElementwiseOperation{static_cast<int>(reduce_total_length)},
AccElementwiseOperation{static_cast<int>(reduce_total_length)});
if(!reduce.IsSupportedArgument(argument_ptr.get()))
{
std::cout
<< "The runtime parameters are not supported by this DeviceReduce instance, exiting!"
<< std::endl;
return (-1);
};
std::string reduce_name = reduce.GetTypeString();
auto invoker_ptr = reduce.MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat);
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
invariant_total_length * sizeof(OutDataType);
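// A reduction is bandwidth-bound: each input element is read once and one
// output element is written per invariant index, so only GB/s is reported.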
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
<< std::endl;
if(args.do_verification)
{
out_dev.FromDevice(out.mData.data());
check_error(out_ref, out);
if(NeedIndices)
{
out_indices_dev.FromDevice(out_indices.mData.data());
check_indices(out_indices_ref, out_indices);
};
};
}
add_example_executable(example_pool2d_fwd pool2d_fwd.cpp)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <thread>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_reduce_util.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "reduction_operator.hpp"
#include "device_pool2d_fwd_nhwc_nhwc.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::AVG;
#endif
static constexpr bool NeedIndices = false;
static constexpr bool PropagateNan = false;
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
InDataType, // InDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
ReduceOpId,
NeedIndices,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
4>; // InSrcOutDstVectorSize
template <typename InDataType,
typename OutDataType,
typename AccDataType,
ck::ReduceTensorOp_t ReduceOpId,
bool PropagateNan,
bool NeedIndices>
static void pool_host_verify(const Tensor<InDataType>& in,
Tensor<OutDataType>& out,
Tensor<int>& out_indices,
const std::array<ck::index_t, 2>& window_spatial_lengths,
const std::array<ck::index_t, 2>& window_strides,
const std::array<ck::index_t, 2>& in_left_pads,
const std::array<ck::index_t, 2>& /*in_right_pads*/)
{
using namespace ck::host_reduce;
const int divider = window_spatial_lengths[0] * window_spatial_lengths[1];
const auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
const auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
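// divider is the window element count; the pre-/post-unary ops use it to turn
// a running SUM into an average for AVG pooling and are no-ops for MAX.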
if constexpr(!NeedIndices)
{
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
for(int y = 0; y < window_spatial_lengths[0]; ++y)
{
int hi = ho * window_strides[0] + y - in_left_pads[0];
for(int x = 0; x < window_spatial_lengths[1]; ++x)
{
int wi = wo * window_strides[1] + x - in_left_pads[1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
}
}
}
PosUnaryOp(accuVal);
out(n, c, ho, wo) = accuVal;
};
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();
auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
for(int y = 0; y < window_spatial_lengths[0]; ++y)
{
int hi = ho * window_strides[0] + y - in_left_pads[0];
for(int x = 0; x < window_spatial_lengths[1]; ++x)
{
int wi = wo * window_strides[1] + x - in_left_pads[1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
int currIndex = y * window_spatial_lengths[1] + x;
PreUnaryOp(currVal);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
}
}
}
PosUnaryOp(accuVal);
out(n, c, ho, wo) = accuVal;
out_indices(n, c, ho, wo) = accuIndex;
};
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
};
}
int main(int argc, char* argv[])
{
using namespace ck::host_reduce;
bool do_verification = false;
int init_method = 0;
int nrepeat = 5;
// Pool shape
ck::index_t N = 128;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
}
else if(argc == 16)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
N = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
window_stride_h = std::stoi(argv[10]);
window_stride_w = std::stoi(argv[11]);
in_left_pad_h = std::stoi(argv[12]);
in_left_pad_w = std::stoi(argv[13]);
in_right_pad_h = std::stoi(argv[14]);
in_right_pad_w = std::stoi(argv[15]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
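// Pooling has no dilation, so the window extent is just Y:
// Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36 for the default shape.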
const std::array<ck::index_t, 2> window_spatial_lengths{{Y, X}};
const std::array<ck::index_t, 2> window_strides{{window_stride_h, window_stride_w}};
const std::array<ck::index_t, 2> input_left_pads{{in_left_pad_h, in_left_pad_w}};
const std::array<ck::index_t, 2> input_right_pads{{in_right_pad_h, in_right_pad_w}};
// tensor layout
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<int> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<int> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * out_n_c_ho_wo_device.mDesc.GetElementSpace());
DeviceMem out_indices_device_buf(sizeof(int) *
out_indices_n_c_ho_wo_device.mDesc.GetElementSpace());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
auto pool = DevicePoolFwdInstance{};
auto invoker_ptr = pool.MakeInvokerPointer();
auto argument_ptr =
pool.MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<int*>(out_indices_device_buf.GetDeviceBuffer()),
N,
C,
std::array<ck::index_t, 2>{{Hi, Wi}},
std::array<ck::index_t, 2>{{Y, X}},
std::array<ck::index_t, 2>{{Ho, Wo}},
window_strides,
input_left_pads,
input_right_pads);
if(!pool.IsSupportedArgument(argument_ptr.get()))
{
throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
std::size_t num_btype =
sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(OutDataType) * (N * C * Ho * Wo);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
pool_host_verify<InDataType,
OutDataType,
AccDataType,
ReduceOpId,
PropagateNan,
NeedIndices>(in_n_c_hi_wi,
out_n_c_ho_wo_host,
out_indices_n_c_ho_wo_host,
window_spatial_lengths,
window_strides,
input_left_pads,
input_right_pads);
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
check_error(out_n_c_ho_wo_host, out_n_c_ho_wo_device);
if constexpr(NeedIndices)
{
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
// check_indices(out_indices_n_c_ho_wo_host, out_indices_n_c_ho_wo_device);
};
}
}
add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <thread>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "convolution_utility.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto MemoryAtomicAdd = ck::InMemoryDataOperationEnum_t::AtomicAdd;
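// AtomicAdd: the kernel accumulates onto the existing contents of the output
// buffer, which is why out_device_buf is pre-filled below and the host
// reference mirrors the accumulation with '+='.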
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
// clang-format off
using DeviceConvFwdInstance = ck::tensor_operation::device::
DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// | InData| WeiData| OutData| AccData| In| Wei| Out| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| GlobalMemory| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
// | | | | | Operation| Operation| Operation| DataOperation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, MemoryAtomicAdd, ConvFwdDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 8, 1, 1,32>, 2>;
// clang-format on
template <typename TIn,
typename TWei,
typename TOut,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp>
void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
const Tensor<TWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const Tensor<TOut>& bias_k,
const std::vector<ck::index_t>& conv_strides,
const std::vector<ck::index_t>& conv_dilations,
const std::vector<ck::index_t>& in_left_pads,
const std::vector<ck::index_t>& /* in_right_pads */,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v_acc = 0;
for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
{
for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
{
float v_in;
float v_wei;
in_element_op(v_in, static_cast<float>(in_n_c_hi_wi(n, c, hi, wi)));
wei_element_op(v_wei, static_cast<float>(wei_k_c_y_x(k, c, y, x)));
v_acc += v_in * v_wei;
}
}
}
}
float v_out;
out_element_op(v_out, v_acc, static_cast<float>(bias_k(k)));
out_n_k_ho_wo(n, k, ho, wo) += v_out;
};
make_ParallelTensorFunctor(f_nchw,
out_n_k_ho_wo.mDesc.GetLengths()[0],
out_n_k_ho_wo.mDesc.GetLengths()[1],
out_n_k_ho_wo.mDesc.GetLengths()[2],
out_n_k_ho_wo.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
bool do_verification = false;
int init_method = 0;
int nrepeat = 5;
// Conv shape
ck::index_t N = 128;
ck::index_t K = 256;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t conv_stride_h = 2;
ck::index_t conv_stride_w = 2;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
}
else if(argc == 19)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
C = std::stoi(argv[6]);
Y = std::stoi(argv[7]);
X = std::stoi(argv[8]);
Hi = std::stoi(argv[9]);
Wi = std::stoi(argv[10]);
conv_stride_h = std::stoi(argv[11]);
conv_stride_w = std::stoi(argv[12]);
conv_dilation_h = std::stoi(argv[13]);
conv_dilation_w = std::stoi(argv[14]);
in_left_pad_h = std::stoi(argv[15]);
in_left_pad_w = std::stoi(argv[16]);
in_right_pad_h = std::stoi(argv[17]);
in_right_pad_w = std::stoi(argv[18]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
const auto output_spatial_lengths =
ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
{Y, X},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
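// ComputeOutputSpatialLengths applies the usual formula
// Ho = (Hi + LeftP + RightP - ((Y - 1) * Dy + 1)) / Sy + 1, i.e. 36 here.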
// tensor layout
auto f_host_tensor_descriptor = [](std::size_t N_,
std::size_t C_,
std::size_t H,
std::size_t W,
auto layout) {
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::KYXC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
std::cout << "bias_k: " << bias_k.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
out_device_buf.ToDevice(out_n_k_ho_wo_host_result.mData.data());
bias_device_buf.ToDevice(bias_k.mData.data());
auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument =
conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device operator with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker.Run(argument, nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
host_reference_calculation(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
bias_k,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
}
}