update conv example

f5e3a6e8 · Chao Liu · c9b86e0c · f5e3a6e8 · f5e3a6e8 · f5e3a6e8
Commit f5e3a6e8 authored Jul 17, 2022 by Chao Liu
12 changed files
--- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
-add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util)
+add_example_executable(example_conv2d_fwd_bias_relu_xdl_fp16 conv2d_fwd_bias_relu_xdl_fp16.cpp)
+target_link_libraries(example_conv2d_fwd_bias_relu_xdl_fp16 PRIVATE utility)
--- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -16,7 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 namespace {

@@ -32,8 +32,10 @@ using InLayout  = ck::tensor_layout::convolution::NHWC;
 using WeiLayout = ck::tensor_layout::convolution::KYXC;
 using OutLayout = ck::tensor_layout::convolution::NHWK;

-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddRelu;

 static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set;
@@ -82,23 +84,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device::
        8>;                           // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on

-using ReferenceConvFwdInstance =
-    ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation<InDataType,
-                                                                 WeiDataType,
-                                                                 OutDataType,
-                                                                 InElementOp,
-                                                                 WeiElementOp,
-                                                                 OutElementOp>;
-
-void PrintUseMsg()
+void print_helper_msg()
 {
    std::cout << "arg1: verification (0=no, 1=yes)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: time kernel (0=n0, 1=yes)\n"
-              << "Following arguments:\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << "arg4: N spatial dimensions (default 2)\n"
+              << "Following arguments (depending on number of spatial dims):\n"
              << " N, K, C, \n"
              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
-              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
+              << " <in_n_hi_wi_c image spatial dimensions>, (ie Hi, Wi for 2D)\n"
              << " <strides>, (ie Sy, Sx for 2D)\n"
              << " <dilations>, (ie Dy, Dx for 2D)\n"
              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
@@ -106,140 +101,151 @@ void PrintUseMsg()
              << std::endl;
 }

-ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[])
+ck::tensor_operation::device::ConvParams
+parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
 {
-    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
-    int num_dim_spatial = 2;
-    int conv_args       = 3 + num_dim_spatial * 6;
-    int cmdline_nargs   = conv_args + 4;
-    if(cmdline_nargs != argc)
-    {
-        PrintUseMsg();
-        exit(0);
-    }
-
-    ck::tensor_operation::device::ConvParams params;
-    int arg_idx = 4;
+    const ck::index_t N = std::stoi(argv[arg_idx++]);
+    const ck::index_t K = std::stoi(argv[arg_idx++]);
+    const ck::index_t C = std::stoi(argv[arg_idx++]);

-    params.num_dim_spatial_ = num_dim_spatial;
-    params.N_               = std::stoi(argv[arg_idx++]);
-    params.K_               = std::stoi(argv[arg_idx++]);
-    params.C_               = std::stoi(argv[arg_idx++]);
+    std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
+    std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
+    std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
+    std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
+    std::vector<ck::index_t> input_left_pads(num_dim_spatial);
+    std::vector<ck::index_t> input_right_pads(num_dim_spatial);

-    params.filter_spatial_lengths_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
+        filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }
-    params.input_spatial_lengths_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
+        input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }
-    params.conv_filter_strides_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
    }
-    params.conv_filter_dilations_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
    }
-    params.input_left_pads_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
+        input_left_pads[i] = std::stoi(argv[arg_idx++]);
    }
-    params.input_right_pads_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
+        input_right_pads[i] = std::stoi(argv[arg_idx++]);
    }

-    return params;
+    return ck::tensor_operation::device::ConvParams{num_dim_spatial,
+                                                    N,
+                                                    K,
+                                                    C,
+                                                    filter_spatial_lengths,
+                                                    input_spatial_lengths,
+                                                    conv_filter_strides,
+                                                    conv_filter_dilations,
+                                                    input_left_pads,
+                                                    input_right_pads};
 }

-} // anonymous namespace
+} // namespace

 int main(int argc, char* argv[])
 {
+    print_helper_msg();
+
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
-    const int num_dim_spatial = 2;
+    int num_dim_spatial  = 2;

-    ck::tensor_operation::device::ConvParams params;
+    ck::tensor_operation::device::ConvParams params{
+        2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};

-    if(argc >= 4)
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
-
-    if(argc >= 5)
+    else
    {
-        params = ParseConvParams(argc, argv);
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        num_dim_spatial = std::stoi(argv[4]);
+
+        params = parse_conv_params(num_dim_spatial, 5, argv);
    }

-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
-                                        static_cast<std::size_t>(params.C_)};
-    input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths_),
-                      std::end(params.input_spatial_lengths_));
-
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
-                                         static_cast<std::size_t>(params.C_)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths_),
-                       std::end(params.filter_spatial_lengths_));
-
-    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
-                                         static_cast<std::size_t>(params.K_)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-
-    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
-    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
-    Tensor<OutDataType> host_output(
-        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
-    Tensor<OutDataType> device_output(
-        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+
+    auto f_nhwc_host_tensor_descriptor =
+        [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
+            std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
+                                                  static_cast<std::size_t>(c)};
+            nhwc_lengths.insert(
+                nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());
+
+            return HostTensorDescriptor(nhwc_lengths);
+        };
+
+    Tensor<InDataType> in_n_hi_wi_c(
+        f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_));
+    Tensor<WeiDataType> wei_k_y_x_c(
+        f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_));
    // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias(
+    Tensor<OutDataType> bias_k(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
+    Tensor<OutDataType> out_n_ho_wo_k_host(
+        f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
+    Tensor<OutDataType> out_n_ho_wo_k_device(
+        f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));

-    std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weights: " << weights.mDesc << std::endl;
-    std::cout << "output: " << host_output.mDesc << std::endl;
-    std::cout << "bias: " << bias.mDesc << std::endl;
+    std::cout << "in_n_hi_wi_c: " << in_n_hi_wi_c.mDesc << std::endl;
+    std::cout << "wei_k_y_x_c: " << wei_k_y_x_c.mDesc << std::endl;
+    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
+    std::cout << "output: " << out_n_ho_wo_k_host.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
-        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        break;
    default:
-        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-        bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace());
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_hi_wi_c.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_y_x_c.mDesc.GetElementSpace());
+    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_n_ho_wo_k_device.mDesc.GetElementSpace());

-    in_device_buf.ToDevice(input.mData.data());
-    wei_device_buf.ToDevice(weights.mData.data());
-    bias_device_buf.ToDevice(bias.mData.data());
+    in_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
+    wei_device_buf.ToDevice(wei_k_y_x_c.mData.data());
+    bias_device_buf.ToDevice(bias_k.mData.data());

+    // do GEMM
    auto conv    = DeviceConvFwdInstance{};
    auto invoker = conv.MakeInvoker();
    auto argument =
@@ -252,59 +258,90 @@ int main(int argc, char* argv[])
                          params.C_,
                          params.input_spatial_lengths_,
                          params.filter_spatial_lengths_,
-                          output_spatial_lengths,
+                          params.GetOutputSpatialLengths(),
                          params.conv_filter_strides_,
                          params.conv_filter_dilations_,
                          params.input_left_pads_,
                          params.input_right_pads_,
-                          InElementOp{},
-                          WeiElementOp{},
-                          OutElementOp{});
+                          in_element_op,
+                          wei_element_op,
+                          out_element_op);

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
-            "wrong! device operator with the specified compilation parameters does "
-            "not support this problem");
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
    }

-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    std::size_t flop = get_flops(
-        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
-    std::size_t num_btype =
-        get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
-                                                        params.C_,
-                                                        params.K_,
-                                                        params.input_spatial_lengths_,
-                                                        params.filter_spatial_lengths_,
-                                                        output_spatial_lengths) +
-        sizeof(OutDataType) * (params.K_);
+    std::size_t flop      = params.GetFlops();
+    std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();

-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;

    if(do_verification)
    {
-        auto ref_conv    = ReferenceConvFwdInstance{};
-        auto ref_invoker = ref_conv.MakeInvoker();
+        // use OutDataType for intermediate data
+        Tensor<OutDataType> tmp_n_ho_wo_k_host(
+            f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
+
+        auto ref_conv =
+            ck::tensor_operation::host::ReferenceConvFwd<2,
+                                                         ck::tensor_layout::convolution::NHWC,
+                                                         ck::tensor_layout::convolution::KYXC,
+                                                         ck::tensor_layout::convolution::NHWK,
+                                                         InDataType,
+                                                         WeiDataType,
+                                                         OutDataType,
+                                                         InElementOp,
+                                                         WeiElementOp,
+                                                         PassThrough>();

-        auto ref_argument = ref_conv.MakeArgument(input,
-                                                  weights,
-                                                  host_output,
-                                                  bias,
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in_n_hi_wi_c,
+                                                  wei_k_y_x_c,
+                                                  tmp_n_ho_wo_k_host,
                                                  params.conv_filter_strides_,
                                                  params.conv_filter_dilations_,
                                                  params.input_left_pads_,
                                                  params.input_right_pads_,
-                                                  InElementOp{},
-                                                  WeiElementOp{},
-                                                  OutElementOp{});
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  PassThrough{});
+
        ref_invoker.Run(ref_argument);
-        out_device_buf.FromDevice(device_output.mData.data());
-        return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
+
+        // FIXME: implement reference pointwise operation
+        for(int n = 0; n < params.N_; n++)
+        {
+            for(int ho = 0; ho < params.output_spatial_lengths_[0]; ho++)
+            {
+                for(int wo = 0; wo < params.output_spatial_lengths_[1]; wo++)
+                {
+                    for(int k = 0; k < params.K_; k++)
+                    {
+                        out_element_op(out_n_ho_wo_k_host(n, ho, wo, k),
+                                       tmp_n_ho_wo_k_host(n, ho, wo, k),
+                                       bias_k(k));
+                    }
+                }
+            }
+        }
+
+        out_device_buf.FromDevice(out_n_ho_wo_k_device.mData.data());
+
+        return ck::utils::check_err(out_n_ho_wo_k_host.mData,
+                                    out_n_ho_wo_k_device.mData,
+                                    "Error: incorrect results!",
+                                    1e-5f,
+                                    1e-4f)
+                   ? 0
+                   : 1;
    }

    return 0;

--- a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
-# FIXME: should fix validation failure
-add_example_executable_no_testing(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_util)
+add_example_executable(example_conv2d_fwd_bias_relu_add_xdl_fp16 conv2d_fwd_bias_relu_add_xdl_fp16.cpp)
+target_link_libraries(example_conv2d_fwd_bias_relu_add_xdl_fp16 PRIVATE utility)
--- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -16,7 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 namespace {

@@ -32,8 +32,10 @@ using InLayout  = ck::tensor_layout::convolution::NHWC;
 using WeiLayout = ck::tensor_layout::convolution::KYXC;
 using OutLayout = ck::tensor_layout::convolution::NHWK;

-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;

 static constexpr auto ConvFwdDefault =
@@ -79,23 +81,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device::
        8>;                      // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on

-using ReferenceConvFwdInstance =
-    ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation_Add<InDataType,
-                                                                     WeiDataType,
-                                                                     OutDataType,
-                                                                     InElementOp,
-                                                                     WeiElementOp,
-                                                                     OutElementOp>;
-
-void PrintUseMsg()
+void print_helper_msg()
 {
    std::cout << "arg1: verification (0=no, 1=yes)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: time kernel (0=n0, 1=yes)\n"
-              << "Following arguments:\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << "arg4: N spatial dimensions (default 2)\n"
+              << "Following arguments (depending on number of spatial dims):\n"
              << " N, K, C, \n"
              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
-              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
+              << " <in_n_hi_wi_c image spatial dimensions>, (ie Hi, Wi for 2D)\n"
              << " <strides>, (ie Sy, Sx for 2D)\n"
              << " <dilations>, (ie Dy, Dx for 2D)\n"
              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
@@ -103,154 +98,157 @@ void PrintUseMsg()
              << std::endl;
 }

-ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[])
+ck::tensor_operation::device::ConvParams
+parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
 {
-    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
-    int num_dim_spatial = 2;
-    int conv_args       = 3 + num_dim_spatial * 6;
-    int cmdline_nargs   = conv_args + 4;
-    if(cmdline_nargs != argc)
-    {
-        PrintUseMsg();
-        exit(0);
-    }
-
-    ck::tensor_operation::device::ConvParams params;
-    int arg_idx = 4;
+    const ck::index_t N = std::stoi(argv[arg_idx++]);
+    const ck::index_t K = std::stoi(argv[arg_idx++]);
+    const ck::index_t C = std::stoi(argv[arg_idx++]);

-    params.num_dim_spatial_ = num_dim_spatial;
-    params.N_               = std::stoi(argv[arg_idx++]);
-    params.K_               = std::stoi(argv[arg_idx++]);
-    params.C_               = std::stoi(argv[arg_idx++]);
+    std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
+    std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
+    std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
+    std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
+    std::vector<ck::index_t> input_left_pads(num_dim_spatial);
+    std::vector<ck::index_t> input_right_pads(num_dim_spatial);

-    params.filter_spatial_lengths_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
+        filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }
-    params.input_spatial_lengths_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
+        input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }
-    params.conv_filter_strides_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
    }
-    params.conv_filter_dilations_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
    }
-    params.input_left_pads_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
+        input_left_pads[i] = std::stoi(argv[arg_idx++]);
    }
-    params.input_right_pads_.resize(num_dim_spatial);
+
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
+        input_right_pads[i] = std::stoi(argv[arg_idx++]);
    }

-    return params;
+    return ck::tensor_operation::device::ConvParams{num_dim_spatial,
+                                                    N,
+                                                    K,
+                                                    C,
+                                                    filter_spatial_lengths,
+                                                    input_spatial_lengths,
+                                                    conv_filter_strides,
+                                                    conv_filter_dilations,
+                                                    input_left_pads,
+                                                    input_right_pads};
 }

 } // anonymous namespace

 int main(int argc, char* argv[])
 {
-    using namespace ck::utils::conv;
+    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
-    const int num_dim_spatial = 2;
+    int num_dim_spatial  = 2;

-    ck::tensor_operation::device::ConvParams params;
+    ck::tensor_operation::device::ConvParams params{
+        2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};

-    if(argc >= 4)
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
-
-    if(argc >= 5)
+    else
    {
-        params = ParseConvParams(argc, argv);
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        num_dim_spatial = std::stoi(argv[4]);
+
+        params = parse_conv_params(num_dim_spatial, 5, argv);
    }

-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
-                                        static_cast<std::size_t>(params.C_)};
-    input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths_),
-                      std::end(params.input_spatial_lengths_));
-
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
-                                         static_cast<std::size_t>(params.C_)};
-    filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths_),
-                       std::end(params.filter_spatial_lengths_));
-
-    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
-                                         static_cast<std::size_t>(params.K_)};
-    output_dims.insert(std::end(output_dims),
-                       std::begin(output_spatial_lengths),
-                       std::end(output_spatial_lengths));
-
-    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
-    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
-    Tensor<OutDataType> host_output(
-        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
-    Tensor<OutDataType> device_output(
-        get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};

-    // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
+    auto f_nhwc_host_tensor_descriptor =
+        [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
+            std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
+                                                  static_cast<std::size_t>(c)};
+            nhwc_lengths.insert(
+                nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());

-    // residual: assume same layout as output tensor
-    Tensor<OutDataType> residual(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
+            return HostTensorDescriptor(nhwc_lengths);
+        };

-    std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weights: " << weights.mDesc << std::endl;
-    std::cout << "output: " << host_output.mDesc << std::endl;
-    std::cout << "bias: " << bias.mDesc << std::endl;
-    std::cout << "residual: " << residual.mDesc << std::endl;
+    Tensor<InDataType> in_n_hi_wi_c(
+        f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_));
+    Tensor<WeiDataType> wei_k_y_x_c(
+        f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_));
+    // bias: assume contiguous 1d vector
+    Tensor<OutDataType> bias_k(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
+    // resi: assume same layout as output tensor
+    Tensor<OutDataType> resi_n_ho_wo_k(
+        f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
+    Tensor<OutDataType> out_n_ho_wo_k_host(
+        f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
+    Tensor<OutDataType> out_n_ho_wo_k_device(
+        f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
+
+    std::cout << "in_n_hi_wi_c: " << in_n_hi_wi_c.mDesc << std::endl;
+    std::cout << "wei_k_y_x_c: " << wei_k_y_x_c.mDesc << std::endl;
+    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
+    std::cout << "resi_n_ho_wo_k: " << resi_n_ho_wo_k.mDesc << std::endl;
+    std::cout << "out_n_ho_wo_k: " << out_n_ho_wo_k_host.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
-        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
-        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
-        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
-        residual.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
+        in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        resi_n_ho_wo_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        break;
    default:
-        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-        bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
-        residual.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        resi_n_ho_wo_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
    }

-    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace());
-    DeviceMem resi_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpace());
-
-    in_device_buf.ToDevice(input.mData.data());
-    wei_device_buf.ToDevice(weights.mData.data());
-    bias_device_buf.ToDevice(bias.mData.data());
-    resi_device_buf.ToDevice(residual.mData.data());
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_hi_wi_c.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_y_x_c.mDesc.GetElementSpace());
+    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
+    DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_ho_wo_k.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_n_ho_wo_k_device.mDesc.GetElementSpace());

-    const auto in_element_op  = InElementOp{};
-    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{};
+    in_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
+    wei_device_buf.ToDevice(wei_k_y_x_c.mData.data());
+    bias_device_buf.ToDevice(bias_k.mData.data());
+    resi_device_buf.ToDevice(resi_n_ho_wo_k.mData.data());

    auto conv    = DeviceConvFwdInstance{};
    auto invoker = conv.MakeInvoker();
@@ -265,7 +263,7 @@ int main(int argc, char* argv[])
                          params.C_,
                          params.input_spatial_lengths_,
                          params.filter_spatial_lengths_,
-                          output_spatial_lengths,
+                          params.output_spatial_lengths_,
                          params.conv_filter_strides_,
                          params.conv_filter_dilations_,
                          params.input_left_pads_,
@@ -281,47 +279,75 @@ int main(int argc, char* argv[])
            "not support this problem");
    }

-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    std::size_t flop = get_flops(
-        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
-    std::size_t num_btype =
-        get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
-                                                        params.C_,
-                                                        params.K_,
-                                                        params.input_spatial_lengths_,
-                                                        params.filter_spatial_lengths_,
-                                                        output_spatial_lengths) +
-        sizeof(OutDataType) * (params.K_) +
-        sizeof(OutDataType) *
-            (params.N_ * params.K_ * output_spatial_lengths[0] * output_spatial_lengths[1]);
-
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
+    std::size_t flop      = params.GetFlops();
+    std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;

    if(do_verification)
    {
-        auto ref_conv    = ReferenceConvFwdInstance{};
-        auto ref_invoker = ref_conv.MakeInvoker();
+        // use OutDataType for intermediate data
+        Tensor<OutDataType> tmp_n_ho_wo_k_host(
+            f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
+
+        auto ref_conv =
+            ck::tensor_operation::host::ReferenceConvFwd<2,
+                                                         ck::tensor_layout::convolution::NHWC,
+                                                         ck::tensor_layout::convolution::KYXC,
+                                                         ck::tensor_layout::convolution::NHWK,
+                                                         InDataType,
+                                                         WeiDataType,
+                                                         OutDataType,
+                                                         InElementOp,
+                                                         WeiElementOp,
+                                                         PassThrough>();

-        auto ref_argument = ref_conv.MakeArgument(input,
-                                                  weights,
-                                                  host_output,
-                                                  bias,
-                                                  residual,
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in_n_hi_wi_c,
+                                                  wei_k_y_x_c,
+                                                  tmp_n_ho_wo_k_host,
                                                  params.conv_filter_strides_,
                                                  params.conv_filter_dilations_,
                                                  params.input_left_pads_,
                                                  params.input_right_pads_,
                                                  in_element_op,
                                                  wei_element_op,
-                                                  out_element_op);
+                                                  PassThrough{});

        ref_invoker.Run(ref_argument);
-        out_device_buf.FromDevice(device_output.mData.data());
-        return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
+
+        // FIXME: implement reference pointwise operation
+        for(int n = 0; n < params.N_; n++)
+        {
+            for(int ho = 0; ho < params.output_spatial_lengths_[0]; ho++)
+            {
+                for(int wo = 0; wo < params.output_spatial_lengths_[1]; wo++)
+                {
+                    for(int k = 0; k < params.K_; k++)
+                    {
+                        out_element_op(out_n_ho_wo_k_host(n, ho, wo, k),
+                                       tmp_n_ho_wo_k_host(n, ho, wo, k),
+                                       bias_k(k),
+                                       resi_n_ho_wo_k(n, ho, wo, k));
+                    }
+                }
+            }
+        }
+
+        out_device_buf.FromDevice(out_n_ho_wo_k_device.mData.data());
+
+        return ck::utils::check_err(out_n_ho_wo_k_host.mData,
+                                    out_n_ho_wo_k_device.mData,
+                                    "Error: incorrect results!",
+                                    1e-5f,
+                                    1e-4f)
+                   ? 0
+                   : 1;
    }

    return 0;

--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -33,8 +33,8 @@ void print_helper_msg()
              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
              << std::endl;
 }
-ck::tensor_operation::device::ConvParams

+ck::tensor_operation::device::ConvParams
 parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
 {
    const ck::index_t N = std::stoi(argv[arg_idx++]);

--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

-template <ck::index_t NumDimSpatial>
+template <ck::index_t NDimSpatial>
 using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
+    NDimSpatial,    //
    InDataType,     //
    WeiDataType,    //
    OutDataType,    //
@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
    WeiElementOp,   // Weights Elementwise Operation
    OutElementOp,   // Output Elementwise Operation
    ConvFwdDefault, // ConvForwardSpecialization
-    NumDimSpatial,  // NumDimSpatial
    256,            // BlockSize
    128,            // MPerBlock
    256,            // NPerBlock

--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

-template <ck::index_t NumDimSpatial>
+template <ck::index_t NDimSpatial>
 using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
+    NDimSpatial,    // NDimSpatial
    InDataType,     //
    WeiDataType,    //
    OutDataType,    //
@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
    WeiElementOp,   // Weights Elementwise Operation
    OutElementOp,   // Output Elementwise Operation
    ConvFwdDefault, // ConvForwardSpecialization
-    NumDimSpatial,  // NumDimSpatial
    256,            // BlockSize
    256,            // MPerBlock
    128,            // NPerBlock

--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

-template <ck::index_t NumDimSpatial>
+template <ck::index_t NDimSpatial>
 using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
+    NDimSpatial,    // NDimSpatial
    InDataType,     //
    WeiDataType,    //
    OutDataType,    //
@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
    WeiElementOp,   // Weights Elementwise Operation
    OutElementOp,   // Output Elementwise Operation
    ConvFwdDefault, // ConvForwardSpecialization
-    NumDimSpatial,  // NumDimSpatial
    256,            // BlockSize
    128,            // MPerBlock
    128,            // NPerBlock

--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

-template <ck::index_t NumDimSpatial>
+template <ck::index_t NDimSpatial>
 using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
+    NDimSpatial,    // NDimSpatial
    InDataType,     //
    WeiDataType,    //
    OutDataType,    //
@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
    WeiElementOp,   // Weights Elementwise Operation
    OutElementOp,   // Output Elementwise Operation
    ConvFwdDefault, // ConvForwardSpecialization
-    NumDimSpatial,  // NumDimSpatial
    256,            // BlockSize
    128,            // MPerBlock
    256,            // NPerBlock
@@ -89,6 +89,10 @@ int main(int argc, char* argv[])
        params = parse_conv_params(num_dim_spatial, 5, argv);
    }

+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+
    if(num_dim_spatial == 1)
    {
        return run_conv_fwd_nhwc<1,
@@ -99,9 +103,13 @@ int main(int argc, char* argv[])
                                 InElementOp,
                                 WeiElementOp,
                                 OutElementOp,
-                                 DeviceConvNDFwdInstance<1>,
-                                 ReferenceConvNDFwdInstance<1>>(
-            params, do_verification, init_method, time_kernel);
+                                 DeviceConvNDFwdInstance<1>>(do_verification,
+                                                             init_method,
+                                                             time_kernel,
+                                                             params,
+                                                             in_element_op,
+                                                             wei_element_op,
+                                                             out_element_op);
    }
    else if(num_dim_spatial == 2)
    {
@@ -113,9 +121,13 @@ int main(int argc, char* argv[])
                                 InElementOp,
                                 WeiElementOp,
                                 OutElementOp,
-                                 DeviceConvNDFwdInstance<2>,
-                                 ReferenceConvNDFwdInstance<2>>(
-            params, do_verification, init_method, time_kernel);
+                                 DeviceConvNDFwdInstance<2>>(do_verification,
+                                                             init_method,
+                                                             time_kernel,
+                                                             params,
+                                                             in_element_op,
+                                                             wei_element_op,
+                                                             out_element_op);
    }
    else if(num_dim_spatial == 3)
    {
@@ -127,9 +139,13 @@ int main(int argc, char* argv[])
                                 InElementOp,
                                 WeiElementOp,
                                 OutElementOp,
-                                 DeviceConvNDFwdInstance<3>,
-                                 ReferenceConvNDFwdInstance<3>>(
-            params, do_verification, init_method, time_kernel);
+                                 DeviceConvNDFwdInstance<3>>(do_verification,
+                                                             init_method,
+                                                             time_kernel,
+                                                             params,
+                                                             in_element_op,
+                                                             wei_element_op,
+                                                             out_element_op);
    }

    return 0;

--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -28,8 +28,6 @@ add_subdirectory(04_gemm_add_add_fastgelu)
 add_subdirectory(06_conv2d_fwd_bias_relu)
 add_subdirectory(07_conv2d_fwd_bias_relu_add)
 add_subdirectory(09_convnd_fwd)
-add_subdirectory(10_conv2d_bwd_data)
-add_subdirectory(11_conv2d_bwd_weight)
 add_subdirectory(12_reduce)
 add_subdirectory(13_pool2d_fwd)
 add_subdirectory(14_gemm_xdl_requant_relu_requant)

--- a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
@@ -19,7 +19,7 @@ getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecializat
    switch(s)
    {
    case ConvolutionBackwardDataSpecialization::Default: return "Default";
-    case ConvolutionBackwardDataSpecialization::FFilter1x1Stride1Pad0:
+    case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0:
        return "FFilter1x1Stride1Pad0";
    default: return "Unrecognized specialization!";
    }

--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -117,12 +117,12 @@ struct Bilinear

 struct AddRelu
 {
-    template <typename T>
-    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+    template <typename Y, typename X0, typename X1>
+    __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const;

    template <>
    __host__ __device__ constexpr void
-    operator()<float>(float& y, const float& x0, const float& x1) const
+    operator()<float, float, float>(float& y, const float& x0, const float& x1) const
    {
        const float a = x0 + x1;
        y             = a > 0.0f ? a : 0.0f;
@@ -130,7 +130,7 @@ struct AddRelu

    template <>
    __host__ __device__ constexpr void
-    operator()<double>(double& y, const double& x0, const double& x1) const
+    operator()<double, double, double>(double& y, const double& x0, const double& x1) const
    {
        const double a = x0 + x1;
        y              = a > 0.0 ? a : 0.0;
@@ -138,11 +138,19 @@ struct AddRelu

    template <>
    __host__ __device__ constexpr void
-    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    operator()<half_t, half_t, half_t>(half_t& y, const half_t& x0, const half_t& x1) const
    {
        const half_t a = x0 + x1;
        y              = a > type_convert<half_t>(0.0f) ? a : type_convert<half_t>(0.0f);
    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t, float, half_t>(half_t& y, const float& x0, const half_t& x1) const
+    {
+        const float a = x0 + x1;
+        y             = a > type_convert<half_t>(0.0f) ? a : type_convert<half_t>(0.0f);
+    };
 };

 struct AddHardswish