Merge remote-tracking branch 'origin/develop' into bf16_int8_ckprofiler

c2b3bede · Chao Liu · ef2defdc · 0619ebf7 · c2b3bede · c2b3bede
Commit c2b3bede authored Mar 04, 2022 by Chao Liu
11 changed files
--- a/example/12_conv2d_bwd_data_xdl/README.md
+++ b/example/12_conv2d_bwd_data_xdl/README.md
+# Instructions for ```conv2d_bwd_data_xdl``` Example
+## Docker script
+```bash
+docker run                                                                   \
+-it                                                                          \
+--rm                                                                         \
+--privileged                                                                 \
+--group-add sudo                                                             \
+-w /root/workspace                                                           \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
+rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
+/bin/bash
+```
+## Build ```conv2d_bwd_data_xdl```
+```bash
+mkdir build && cd build
+```
+```bash
+# Specify the target GPU architecture (target ID); the example below uses gfx908
+cmake                                                                  \
+-D BUILD_DEV=OFF                                                       \
+-D CMAKE_BUILD_TYPE=Release                                            \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
+..
+```
+```bash
+ make -j conv2d_bwd_data_xdl
+```
+## Run ```conv2d_bwd_data_xdl```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: run kernel # of times (>1)
+#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
+./bin/conv2d_bwd_data_xdl 0 1 5
+```
+Result
+```
+in_n_c_hi_wi: dim 4, lengths {128, 256, 71, 71}, strides {1290496, 1, 18176, 256}
+wei_k_c_y_x: dim 4, lengths {256, 256, 3, 3}, strides {2304, 1, 768, 256}
+out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
+arg.a_grid_desc_k0_m_k1_container_{128, 175232, 8}
+arg.b_grid_desc_k0_n_k1_container_{128, 256, 8}
+arg.c_grid_desc_m_n_container_{ 175232, 256}
+arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 4, 2, 2, 4, 2 ) 
+launch_and_time_kernel: grid_dim {2738, 1, 1}, block_dim {256, 1, 1} 
+Warm up
+Start running 1 times...
+arg.a_grid_desc_k0_m_k1_container_{64, 175232, 8}
+arg.b_grid_desc_k0_n_k1_container_{64, 256, 8}
+arg.c_grid_desc_m_n_container_{ 175232, 256}
+arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 4, 2, 2, 4, 2 ) 
+launch_and_time_kernel: grid_dim {2738, 1, 1}, block_dim {256, 1, 1} 
+Warm up
+Start running 1 times...
+arg.a_grid_desc_k0_m_k1_container_{64, 175232, 8}
+arg.b_grid_desc_k0_n_k1_container_{64, 256, 8}
+arg.c_grid_desc_m_n_container_{ 175232, 256}
+arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 4, 2, 2, 4, 2 ) 
+launch_and_time_kernel: grid_dim {2738, 1, 1}, block_dim {256, 1, 1} 
+Warm up
+Start running 1 times...
+arg.a_grid_desc_k0_m_k1_container_{32, 175232, 8}
+arg.b_grid_desc_k0_n_k1_container_{32, 256, 8}
+arg.c_grid_desc_m_n_container_{ 175232, 256}
+arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 4, 2, 2, 4, 2 ) 
+launch_and_time_kernel: grid_dim {2738, 1, 1}, block_dim {256, 1, 1} 
+Warm up
+Start running 1 times...
+Perf: 2.45966 ms, 79.5597 TFlops, 169.325 GB/s
+```
--- a/example/12_conv2d_bwd_data_xdl/conv2d_bwd_data_xdl.cpp
+++ b/example/12_conv2d_bwd_data_xdl/conv2d_bwd_data_xdl.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "tensor_layout.hpp"
+#include "element_wise_operation.hpp"
+#include "device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
+#include "reference_conv_bwd_data.hpp"
+using InDataType  = ck::half_t; // element type of the input tensor (the result this example computes)
+using WeiDataType = ck::half_t; // element type of the weight tensor
+using OutDataType = ck::half_t; // element type of the output tensor (fed to the device op)
+using AccDataType = float; // passed to the device op as its AccDataType
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for ck::Sequence used in the instance below
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvBwdDefault =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
+using DeviceConvBwdDataInstance = ck::tensor_operation::device::
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        InDataType,     // InDataType
+        WeiDataType,    // WeiDataType
+        OutDataType,    // OutDataType
+        AccDataType,    // AccDataType
+        InElementOp,    // InElementwiseOperation
+        WeiElementOp,   // WeiElementwiseOperation
+        OutElementOp,   // OutElementwiseOperation
+        ConvBwdDefault, // ConvolutionBackwardDataSpecialization_t
+        256,            // BlockSize
+        128,            // MPerBlock
+        128,            // NPerBlock
+        4,              // K0PerBlock
+        8,              // K1
+        32,             // MPerXdl
+        32,             // NPerXdl
+        2,              // MXdlPerWave
+        2,              // NXdlPerWave
+        S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
+        2,              // ABlockTransferSrcVectorDim
+        8,              // ABlockTransferSrcScalarPerVector
+        8,              // ABlockTransferDstScalarPerVector_K1
+        true,           // ABlockLdsAddExtraM
+        S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<2, 0, 1>,     // BBlockTransferThreadClusterArrangeOrder
+        S<0, 2, 1>,     // BBlockTransferSrcAccessOrder
+        1,              // BBlockTransferSrcVectorDim
+        2,              // BBlockTransferSrcScalarPerVector
+        8,              // BBlockTransferDstScalarPerVector_K1
+        true,           // BBlockLdsAddExtraN
+        7,              // NOTE(review): unlabelled here; presumably the C thread-transfer vector dim - confirm against the device op header
+        1>; // GemmCThreadTransferDstScalarPerVector
+using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
+                                                                                  WeiDataType,
+                                                                                  OutDataType,
+                                                                                  InElementOp,
+                                                                                  WeiElementOp,
+                                                                                  OutElementOp>;
+/// Example driver for the 2D convolution backward-data device op.
+///
+/// Accepts either 3 or 18 arguments after the program name; any other count prints the usage
+/// text and exits with status 0.
+///   arg1      verification (0=no, 1=yes)
+///   arg2      initialization (0: none, 1: GeneratorTensor_2{-5, 5}, other: GeneratorTensor_1{1})
+///   arg3      repeat count forwarded to the invoker
+///   arg4..18  N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
+///
+/// Throws std::runtime_error when the device op reports the problem as unsupported.
+int main(int argc, char* argv[])
+{
+    bool do_verification = false;
+    int init_method      = 0;
+    int nrepeat          = 5;
+    // Conv shape (used when only the first three arguments are given)
+    ck::index_t N               = 128;
+    ck::index_t K               = 256;
+    ck::index_t C               = 256;
+    ck::index_t Y               = 3;
+    ck::index_t X               = 3;
+    ck::index_t Hi              = 71;
+    ck::index_t Wi              = 71;
+    ck::index_t conv_stride_h   = 2;
+    ck::index_t conv_stride_w   = 2;
+    ck::index_t conv_dilation_h = 1;
+    ck::index_t conv_dilation_w = 1;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+    if(argc != 4 && argc != 19)
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        // Exit status intentionally left at 0 so existing callers see no change.
+        exit(0);
+    }
+    // The first three arguments are common to both accepted forms.
+    do_verification = std::stoi(argv[1]);
+    init_method     = std::stoi(argv[2]);
+    nrepeat         = std::stoi(argv[3]);
+    if(argc == 19)
+    {
+        N               = std::stoi(argv[4]);
+        K               = std::stoi(argv[5]);
+        C               = std::stoi(argv[6]);
+        Y               = std::stoi(argv[7]);
+        X               = std::stoi(argv[8]);
+        Hi              = std::stoi(argv[9]);
+        Wi              = std::stoi(argv[10]);
+        conv_stride_h   = std::stoi(argv[11]);
+        conv_stride_w   = std::stoi(argv[12]);
+        conv_dilation_h = std::stoi(argv[13]);
+        conv_dilation_w = std::stoi(argv[14]);
+        in_left_pad_h   = std::stoi(argv[15]);
+        in_left_pad_w   = std::stoi(argv[16]);
+        in_right_pad_h  = std::stoi(argv[17]);
+        in_right_pad_w  = std::stoi(argv[18]);
+    }
+    // Output spatial size derived from the padded input and the dilated filter extent.
+    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+    const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}};
+    const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}};
+    const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}};
+    const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}};
+    // tensor layout: lengths are given as {N, C, H, W} while the strides make C the
+    // fastest-varying dimension
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
+            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                        std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+        };
+    Tensor<OutDataType> out_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo));
+    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X));
+    Tensor<InDataType> in_n_c_hi_wi_host_result(f_host_tensor_descriptor(N, C, Hi, Wi));
+    Tensor<InDataType> in_n_c_hi_wi_device_result(f_host_tensor_descriptor(N, C, Hi, Wi));
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi_host_result.mDesc << std::endl;
+    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
+    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;
+    // Only the output and weight tensors are initialized; the input tensor is the result.
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        break;
+    default:
+        out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
+    }
+    DeviceMem in_device_buf(sizeof(InDataType) *
+                            in_n_c_hi_wi_device_result.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace());
+    out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
+    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    // Build the device op, its invoker and the argument describing this problem.
+    auto conv     = DeviceConvBwdDataInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                      N,
+                                      K,
+                                      C,
+                                      std::vector<ck::index_t>{{Hi, Wi}},
+                                      std::vector<ck::index_t>{{Y, X}},
+                                      std::vector<ck::index_t>{{Ho, Wo}},
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      InElementOp{},
+                                      WeiElementOp{},
+                                      OutElementOp{});
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+    const float ave_time = invoker.Run(argument, nrepeat);
+    // Element counts are accumulated in std::size_t: multiplying the ck::index_t values
+    // directly would overflow 32-bit arithmetic on large problems before the widening.
+    const std::size_t in_elems  = static_cast<std::size_t>(N) * C * Hi * Wi;
+    const std::size_t wei_elems = static_cast<std::size_t>(K) * C * Y * X;
+    const std::size_t out_elems = static_cast<std::size_t>(N) * K * Ho * Wo;
+    const std::size_t flop      = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+    const std::size_t num_btype = sizeof(InDataType) * in_elems +
+                                  sizeof(WeiDataType) * wei_elems +
+                                  sizeof(OutDataType) * out_elems;
+    // ave_time is printed in ms below, hence the 1e9 / 1e6 divisors for TFlops and GB/s.
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+    if(do_verification)
+    {
+        // Compute the expected input tensor on the host and compare with the device result.
+        auto ref_conv     = ReferenceConvBwdInstance{};
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result,
+                                                  wei_k_c_y_x,
+                                                  out_n_k_ho_wo,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads,
+                                                  InElementOp{},
+                                                  WeiElementOp{},
+                                                  OutElementOp{});
+        ref_invoker.Run(ref_argument);
+        in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
+        check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
+    }
+}
--- a/example/1_gemm_xdl/gemm_xdl.cpp
+++ b/example/1_gemm_xdl/gemm_xdl.cpp
@@ -41,8 +41,7 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp = ck::tensor_operation::element_wise::PassThrough;
 using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto GemmDefault   = ck::tensor_operation::device::GemmSpecialization_t::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
-static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding;
 // clang-format off
 #if 0

--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -26,6 +26,7 @@ set(GEMM_XDL_ALPHA_BETA_SOURCE 8_gemm_xdl_alpha_beta/gemm_xdl_alpha_beta.cpp)
 set(CONV2D_FWD_XDL_INT8_SOURCE 9_conv2d_fwd_xdl_int8/conv2d_fwd_xdl_int8.cpp)
 set(CONV3D_FWD_XDL_SOURCE 10_conv3d_fwd_xdl/conv3d_fwd_xdl.cpp)
 set(CONVND_FWD_XDL_SOURCE 11_convnd_fwd_xdl/convnd_fwd_xdl.cpp)
+set(CONV2D_BWD_DATA_XDL_SOURCE 12_conv2d_bwd_data_xdl/conv2d_bwd_data_xdl.cpp)
 add_executable(gemm_xdl ${GEMM_XDL_SOURCE})
 add_executable(gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE})
@@ -40,6 +41,7 @@ add_executable(gemm_xdl_alpha_beta ${GEMM_XDL_ALPHA_BETA_SOURCE})
 add_executable(conv2d_fwd_xdl_int8 ${CONV2D_FWD_XDL_INT8_SOURCE})
 add_executable(conv3d_fwd_xdl ${CONV3D_FWD_XDL_SOURCE})
 add_executable(convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
+add_executable(conv2d_bwd_data_xdl ${CONV2D_BWD_DATA_XDL_SOURCE})
 target_link_libraries(gemm_xdl PRIVATE host_tensor)
 target_link_libraries(gemm_xdl_int8 PRIVATE host_tensor)
@@ -54,3 +56,5 @@ target_link_libraries(gemm_xdl_alpha_beta PRIVATE host_tensor)
 target_link_libraries(conv2d_fwd_xdl_int8 PRIVATE host_tensor)
 target_link_libraries(conv3d_fwd_xdl PRIVATE host_tensor)
 target_link_libraries(convnd_fwd_xdl PRIVATE host_tensor)
+target_link_libraries(conv2d_bwd_data_xdl PRIVATE host_tensor)
--- a/profiler/CMakeLists.txt
+++ b/profiler/CMakeLists.txt
@@ -25,6 +25,7 @@ set(PROFILER_SOURCE
    src/profile_conv_fwd_bias_relu_add.cpp
    src/profile_conv_fwd_bias_relu_atomic_add.cpp
    src/profile_batched_gemm.cpp
+    src/profile_conv_bwd_data.cpp
 )
 add_executable(ckProfiler ${PROFILER_SOURCE})
@@ -39,3 +40,4 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
+target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
--- a/profiler/include/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profile_conv_bwd_data_impl.hpp
+#pragma once
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "tensor_layout.hpp"
+#include "device_tensor.hpp"
+#include "device_conv_bwd_data.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_conv_bwd_data.hpp"
+// Short names for the element types used by the profiler.
+using F16  = ck::half_t;
+using F32  = float;
+using BF16 = ushort;
+using INT8 = int8_t;
+// Declarations of the functions that append the NHWC/KYXC/NHWK backward-data device
+// instances for each data type to the given vector. No definitions appear in this header.
+namespace ck::tensor_operation::device::device_conv2d_bwd_data_instance {
+// Pointer to a backward-data device op whose three element-wise operations are PassThrough.
+using DeviceConvBwdDataNoOpPtr =
+    DeviceConvBwdDataPtr<ck::tensor_operation::element_wise::PassThrough,
+                         ck::tensor_operation::element_wise::PassThrough,
+                         ck::tensor_operation::element_wise::PassThrough>;
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
+    std::vector<DeviceConvBwdDataNoOpPtr>& instances);
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvBwdDataNoOpPtr>& instances);
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
+    std::vector<DeviceConvBwdDataNoOpPtr>& instances);
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
+    std::vector<DeviceConvBwdDataNoOpPtr>& instances);
+} // namespace ck::tensor_operation::device::device_conv2d_bwd_data_instance
+namespace ck {
+namespace profiler {
+/// Profile every registered 2D convolution backward-data device instance on one problem.
+///
+/// Host tensors are created for the requested layouts, the output and weight tensors are
+/// initialized according to init_method, and each device instance that reports the problem
+/// as supported is timed. Per-instance and best results are printed to std::cout.
+///
+/// @tparam NDimSpatial  Not referenced by the body; elements 0 and 1 of each spatial-length
+///                      vector are read, so at least two spatial dimensions must be supplied.
+/// @tparam InDataType   Element type of the input tensor (the computed result).
+/// @tparam WeiDataType  Element type of the weight tensor.
+/// @tparam OutDataType  Element type of the output tensor (fed to the device op).
+/// @tparam InLayout     Layout tag selecting the host strides of the input tensor.
+/// @tparam WeiLayout    Layout tag selecting the host strides of the weight tensor.
+/// @tparam OutLayout    Layout tag selecting the host strides of the output tensor.
+/// @param do_verification  Non-zero: run the host reference once and call check_error for
+///                         every profiled instance.
+/// @param init_method      0: no initialization; 1: GeneratorTensor_2{-5, 5}; any other value:
+///                         GeneratorTensor_3 ({0.0, 1.0} for output, {-0.5, 0.5} for weight).
+/// @param do_log           With verification enabled, also dump the tensors to std::cout.
+/// @param nrepeat          Forwarded to each invoker's Run().
+/// @param N                Forwarded to the device op; first length of input/output tensors.
+/// @param K                Forwarded to the device op; first length of the weight tensor.
+/// @param C                Forwarded to the device op; second length of input/weight tensors.
+/// @param input_spatial_lengths   {Hi, Wi}
+/// @param filter_spatial_lengths  {Y, X}
+/// @param output_spatial_lengths  {Ho, Wo}
+/// @param conv_filter_strides     Forwarded to the reference and device ops.
+/// @param conv_filter_dilations   Forwarded to the reference and device ops.
+/// @param input_left_pads         Forwarded to the reference and device ops.
+/// @param input_right_pads        Forwarded to the reference and device ops.
+/// @throws std::runtime_error if no device instance is registered for the data types.
+template <int NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+void profile_conv_bwd_data_impl(int do_verification,
+                                int init_method,
+                                bool do_log,
+                                int nrepeat,
+                                ck::index_t N,
+                                ck::index_t K,
+                                ck::index_t C,
+                                std::vector<ck::index_t> input_spatial_lengths,
+                                std::vector<ck::index_t> filter_spatial_lengths,
+                                std::vector<ck::index_t> output_spatial_lengths,
+                                std::vector<ck::index_t> conv_filter_strides,
+                                std::vector<ck::index_t> conv_filter_dilations,
+                                std::vector<ck::index_t> input_left_pads,
+                                std::vector<ck::index_t> input_right_pads)
+{
+    const ck::index_t Y = filter_spatial_lengths[0];
+    const ck::index_t X = filter_spatial_lengths[1];
+    const ck::index_t Hi = input_spatial_lengths[0];
+    const ck::index_t Wi = input_spatial_lengths[1];
+    const ck::index_t Ho = output_spatial_lengths[0];
+    const ck::index_t Wo = output_spatial_lengths[1];
+    // Lengths are always given as {N, C, H, W}; the layout tag only selects the strides.
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
+            if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
+                         is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
+                         is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+            }
+            else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
+                              is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
+                              is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+            }
+        };
+    Tensor<InDataType> in_n_c_hi_wi_host_result(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
+    Tensor<InDataType> in_n_c_hi_wi_device_result(
+        f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
+    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
+    Tensor<OutDataType> out_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi_host_result.mDesc << std::endl;
+    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
+    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;
+    // The output tensor holds OutDataType elements, so its generators use OutDataType
+    // (previously InDataType, which only matched when both types were identical).
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        break;
+    default:
+        out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+    }
+    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+    if(do_verification)
+    {
+        // The host reference is computed once and reused for every device instance.
+        using ReferenceConvBwdDataInstance =
+            ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
+                                                             WeiDataType,
+                                                             OutDataType,
+                                                             InElementOp,
+                                                             WeiElementOp,
+                                                             OutElementOp>;
+        auto ref_conv     = ReferenceConvBwdDataInstance{};
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result,
+                                                  wei_k_c_y_x,
+                                                  out_n_k_ho_wo,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op);
+        ref_invoker.Run(ref_argument);
+    }
+    DeviceMem in_device_buf(sizeof(InDataType) *
+                            in_n_c_hi_wi_device_result.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace());
+    out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
+    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using DeviceConvBwdDataNoOpPtr =
+        ck::tensor_operation::device::DeviceConvBwdDataPtr<PassThrough, PassThrough, PassThrough>;
+    // add device Conv instances matching the (uniform) data type of the three tensors
+    std::vector<DeviceConvBwdDataNoOpPtr> conv_ptrs;
+    if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
+                 ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
+                 ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
+    {
+        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
+            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
+    }
+    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
+                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
+                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
+    {
+        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
+            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+    }
+    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ushort> &&
+                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ushort> &&
+                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ushort>)
+    {
+        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
+            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
+    }
+    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, int8_t> &&
+                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, int8_t> &&
+                      ck::is_same_v<ck::remove_cv_t<OutDataType>, int8_t>)
+    {
+        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
+            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
+    }
+    if(conv_ptrs.empty())
+    {
+        throw std::runtime_error("wrong! no device Conv instance found");
+    }
+    // Work and traffic estimates do not depend on the instance, so compute them once.
+    // Element counts are accumulated in std::size_t: multiplying the ck::index_t values
+    // directly would overflow 32-bit arithmetic on large problems before the widening.
+    const std::size_t in_elems  = static_cast<std::size_t>(N) * C * Hi * Wi;
+    const std::size_t wei_elems = static_cast<std::size_t>(K) * C * Y * X;
+    const std::size_t out_elems = static_cast<std::size_t>(N) * K * Ho * Wo;
+    const std::size_t flop      = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+    const std::size_t num_btype = sizeof(InDataType) * in_elems +
+                                  sizeof(WeiDataType) * wei_elems +
+                                  sizeof(OutDataType) * out_elems;
+    std::string best_conv_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    // profile device Conv instances
+    for(auto& conv_ptr : conv_ptrs)
+    {
+        auto argument_ptr = conv_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            N,
+            K,
+            C,
+            input_spatial_lengths,
+            filter_spatial_lengths,
+            output_spatial_lengths,
+            conv_filter_strides,
+            conv_filter_dilations,
+            input_left_pads,
+            input_right_pads,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+        auto invoker_ptr = conv_ptr->MakeInvokerPointer();
+        // Instances that do not support this problem are skipped silently.
+        if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::string conv_name = conv_ptr->GetTypeString();
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            // ave_time is printed in ms, hence the 1e9 / 1e6 divisors for TFlops and GB/s.
+            float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                      << " GB/s, " << conv_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_conv_name  = conv_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+            if(do_verification)
+            {
+                in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
+                check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
+                if(do_log)
+                {
+                    // NOTE(review): the labels name the operands from the op's point of view
+                    // ("in" prints the output tensor, "out_*" the computed input tensor);
+                    // strings are kept unchanged so existing log consumers are unaffected.
+                    LogRangeAsType<float>(std::cout << "in : ", out_n_k_ho_wo.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "out_host  : ", in_n_c_hi_wi_host_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "out_device: ", in_n_c_hi_wi_device_result.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+    }
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
+}
+} // namespace profiler
+} // namespace ck
--- a/profiler/src/profile_conv_bwd_data.cpp
+++ b/profiler/src/profile_conv_bwd_data.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "profile_conv_bwd_data_impl.hpp"
+// Data-type combination selected on the command line (input_weight_output).
+enum ConvDataType
+{
+    F32_F32_F32    = 0,
+    F16_F16_F16    = 1,
+    BF16_BF16_BF16 = 2,
+    INT8_INT8_INT8 = 3,
+};
+// Input tensor layout selector.
+enum ConvInputLayout
+{
+    NCHW = 0,
+    NHWC = 1,
+};
+// Weight tensor layout selector.
+enum ConvWeightLayout
+{
+    KCYX = 0,
+    KYXC = 1,
+};
+// Output tensor layout selector.
+enum ConvOutputLayout
+{
+    NKHW = 0,
+    NHWK = 1,
+};
+// Command-line driver for profiling 2D backward-data convolution.
+//
+// argv must hold exactly 25 entries: the program name, the operation name and
+// the 23 values listed in the usage text below. With any other count the usage
+// text is printed and the process terminates with exit code 1.
+//
+// Only the NHWC (input) / KYXC (weight) / NHWK (output) layout combination is
+// dispatched, for data types fp32, fp16, bf16 and int8. Every other
+// combination throws std::runtime_error. std::stoi additionally throws on
+// arguments that are not valid integers.
+//
+// Returns 1 once the profiling call has completed.
+// NOTE(review): main() in profiler.cpp returns this value as the process exit
+// status, so a successful run exits with 1 -- confirm this is intended.
+int profile_conv_bwd_data(int argc, char* argv[])
+{
+    if(argc != 25)
+    {
+        printf("arg1: tensor operation (conv_bwd: BackwardConvolution)\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
+        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
+        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
+        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
+        printf("arg6: verification (0: no; 1: yes)\n");
+        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg8: print tensor value (0: no; 1: yes)\n");
+        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(1);
+    }
+    // Selectors and run options.
+    const int data_type        = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const int in_layout        = static_cast<ConvInputLayout>(std::stoi(argv[3]));
+    const int wei_layout       = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
+    const int out_layout       = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
+    const bool do_verification = std::stoi(argv[6]);
+    const int init_method      = std::stoi(argv[7]);
+    const bool do_log          = std::stoi(argv[8]);
+    const int nrepeat          = std::stoi(argv[9]);
+    // Problem size.
+    const ck::index_t N  = std::stoi(argv[10]);
+    const ck::index_t K  = std::stoi(argv[11]);
+    const ck::index_t C  = std::stoi(argv[12]);
+    const ck::index_t Y  = std::stoi(argv[13]);
+    const ck::index_t X  = std::stoi(argv[14]);
+    const ck::index_t Hi = std::stoi(argv[15]);
+    const ck::index_t Wi = std::stoi(argv[16]);
+    const ck::index_t conv_stride_h   = std::stoi(argv[17]);
+    const ck::index_t conv_stride_w   = std::stoi(argv[18]);
+    const ck::index_t conv_dilation_h = std::stoi(argv[19]);
+    const ck::index_t conv_dilation_w = std::stoi(argv[20]);
+    const ck::index_t in_left_pad_h   = std::stoi(argv[21]);
+    const ck::index_t in_left_pad_w   = std::stoi(argv[22]);
+    const ck::index_t in_right_pad_h  = std::stoi(argv[23]);
+    const ck::index_t in_right_pad_w  = std::stoi(argv[24]);
+    // Effective (dilated) filter extent and the resulting output spatial size.
+    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+    // The only layout combination that has a dispatch target below.
+    const bool is_nhwc_kyxc_nhwk = in_layout == ConvInputLayout::NHWC &&
+                                   wei_layout == ConvWeightLayout::KYXC &&
+                                   out_layout == ConvOutputLayout::NHWK;
+    // Profiles one element-type combination on the NHWC/KYXC/NHWK layout.
+    // The three tag arguments only carry the element types passed as the
+    // 2nd-4th template arguments; their values are never read.
+    const auto profile_nhwc_kyxc_nhwk = [&](auto in_tag, auto wei_tag, auto out_tag) {
+        ck::profiler::profile_conv_bwd_data_impl<2,
+                                                 decltype(in_tag),
+                                                 decltype(wei_tag),
+                                                 decltype(out_tag),
+                                                 ck::tensor_layout::convolution::NHWC,
+                                                 ck::tensor_layout::convolution::KYXC,
+                                                 ck::tensor_layout::convolution::NHWK>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            N,
+            K,
+            C,
+            std::vector<ck::index_t>{Hi, Wi},
+            std::vector<ck::index_t>{Y, X},
+            std::vector<ck::index_t>{Ho, Wo},
+            std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
+            std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
+            std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
+            std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
+    };
+    if(is_nhwc_kyxc_nhwk && data_type == ConvDataType::F32_F32_F32)
+    {
+        profile_nhwc_kyxc_nhwk(float{}, float{}, float{});
+    }
+    else if(is_nhwc_kyxc_nhwk && data_type == ConvDataType::F16_F16_F16)
+    {
+        profile_nhwc_kyxc_nhwk(ck::half_t{}, ck::half_t{}, ck::half_t{});
+    }
+    else if(is_nhwc_kyxc_nhwk && data_type == ConvDataType::BF16_BF16_BF16)
+    {
+        // bf16 is passed to the profiler implementation as uint16_t.
+        profile_nhwc_kyxc_nhwk(uint16_t{}, uint16_t{}, uint16_t{});
+    }
+    else if(is_nhwc_kyxc_nhwk && data_type == ConvDataType::INT8_INT8_INT8)
+    {
+        profile_nhwc_kyxc_nhwk(int8_t{}, int8_t{}, int8_t{});
+    }
+    else
+    {
+        throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
+    }
+    return 1;
+}
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -14,6 +14,7 @@ int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
+int profile_conv_bwd_data(int, char*[]);
 int main(int argc, char* argv[])
 {
@@ -53,6 +54,10 @@ int main(int argc, char* argv[])
    {
        return profile_conv_fwd_bias_relu_atomic_add(argc, argv);
    }
+    else if(strcmp(argv[1], "conv_bwd") == 0)
+    {
+        return profile_conv_bwd_data(argc, argv);
+    }
    else
    {
        // clang-format off
@@ -63,7 +68,8 @@ int main(int argc, char* argv[])
               "                        conv_fwd: ForwardConvolution\n"
               "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
               "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
-               "                        conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n");
+               "                        conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
+               "                        conv_bwd: BackwardConvolution\n");
        // clang-format on
        return 0;

--- a/reference_operation/include/reference_conv_bwd_data.hpp
+++ b/reference_operation/include/reference_conv_bwd_data.hpp
+#ifndef REFERENCE_CONV_BWD_DATA_HPP
+#define REFERENCE_CONV_BWD_DATA_HPP
+#include <iostream>
+#include <sstream>
+#include "device_base.hpp"
+#include "host_tensor.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace host {
+// Host reference implementation of 2D backward-data convolution:
+//
+//   in[n, c, hi, wi] = sum over (k, y, x) of out[n, k, ho, wo] * wei[k, c, y, x]
+//
+// where, for each filter tap (y, x),
+//   ho = (hi + left_pad_h - y * dilation_h) / stride_h
+//   wo = (wi + left_pad_w - x * dilation_w) / stride_w
+// and a tap contributes only when both divisions are exact and
+// 0 <= ho < Ho, 0 <= wo < Wo.
+//
+// Tensors are indexed logically as in(n, c, hi, wi), wei(k, c, y, x) and
+// out(n, k, ho, wo). Products are accumulated in float. Each element-wise
+// operation is invoked as op(dst, src).
+template <typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+struct ReferenceConvBwdData : public device::BaseOperator
+{
+    // Argument
+    // Bundles everything Invoker::Run needs. The three tensors are held by
+    // reference, so they must outlive the Argument; in_n_c_hi_wi is the tensor
+    // that Run writes.
+    struct Argument : public device::BaseArgument
+    {
+        Argument(Tensor<InDataType>& in_n_c_hi_wi,
+                 const Tensor<WeiDataType>& wei_k_c_y_x,
+                 const Tensor<OutDataType>& out_n_k_ho_wo,
+                 std::vector<ck::index_t> conv_filter_strides,
+                 std::vector<ck::index_t> conv_filter_dilations,
+                 std::vector<ck::index_t> input_left_pads,
+                 std::vector<ck::index_t> input_right_pads,
+                 InElementwiseOperation in_element_op,
+                 WeiElementwiseOperation wei_element_op,
+                 OutElementwiseOperation out_element_op)
+            : in_n_c_hi_wi_{in_n_c_hi_wi},
+              wei_k_c_y_x_{wei_k_c_y_x},
+              out_n_k_ho_wo_{out_n_k_ho_wo},
+              conv_strides_{conv_filter_strides},
+              conv_dilations_{conv_filter_dilations},
+              in_left_pads_{input_left_pads},
+              in_right_pads_{input_right_pads},
+              in_element_op_{in_element_op},
+              wei_element_op_{wei_element_op},
+              out_element_op_{out_element_op}
+        {
+        }
+        Tensor<InDataType>& in_n_c_hi_wi_;
+        const Tensor<WeiDataType>& wei_k_c_y_x_;
+        const Tensor<OutDataType>& out_n_k_ho_wo_;
+        // Two entries each: index 0 is the H direction, index 1 the W direction.
+        std::vector<index_t> conv_strides_;
+        std::vector<index_t> conv_dilations_;
+        std::vector<index_t> in_left_pads_;
+        // Stored but not read by Invoker::Run.
+        std::vector<index_t> in_right_pads_;
+        InElementwiseOperation in_element_op_;
+        WeiElementwiseOperation wei_element_op_;
+        OutElementwiseOperation out_element_op_;
+    };
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceConvBwdData::Argument;
+        // Computes every element of arg.in_n_c_hi_wi_ from the output and
+        // weight tensors. Always returns 0.
+        float Run(const Argument& arg)
+        {
+            // The filter and output extents do not depend on the element being
+            // computed, so read them once. They are held as signed ints so the
+            // index arithmetic and comparisons below never mix signed and
+            // unsigned operands.
+            const auto& wei_lengths = arg.wei_k_c_y_x_.mDesc.GetLengths();
+            const auto& out_lengths = arg.out_n_k_ho_wo_.mDesc.GetLengths();
+            const int K  = static_cast<int>(wei_lengths[0]);
+            const int Y  = static_cast<int>(wei_lengths[2]);
+            const int X  = static_cast<int>(wei_lengths[3]);
+            const int Ho = static_cast<int>(out_lengths[2]);
+            const int Wo = static_cast<int>(out_lengths[3]);
+            const int stride_h   = arg.conv_strides_[0];
+            const int stride_w   = arg.conv_strides_[1];
+            const int dilation_h = arg.conv_dilations_[0];
+            const int dilation_w = arg.conv_dilations_[1];
+            const int left_pad_h = arg.in_left_pads_[0];
+            const int left_pad_w = arg.in_left_pads_[1];
+            // Computes one input element in(n, c, hi, wi).
+            auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
+                float v_acc = 0;
+                for(int y = 0; y < Y; ++y)
+                {
+                    // Numerator of the ho formula; may be negative, in which
+                    // case the range check on ho rejects the tap.
+                    const int h_tmp = static_cast<int>(hi) + left_pad_h - y * dilation_h;
+                    if(h_tmp % stride_h != 0)
+                    {
+                        continue;
+                    }
+                    const int ho = h_tmp / stride_h;
+                    if(ho < 0 || ho >= Ho)
+                    {
+                        continue;
+                    }
+                    for(int x = 0; x < X; ++x)
+                    {
+                        const int w_tmp = static_cast<int>(wi) + left_pad_w - x * dilation_w;
+                        if(w_tmp % stride_w != 0)
+                        {
+                            continue;
+                        }
+                        const int wo = w_tmp / stride_w;
+                        if(wo < 0 || wo >= Wo)
+                        {
+                            continue;
+                        }
+                        for(int k = 0; k < K; ++k)
+                        {
+                            float v_out = 0;
+                            float v_wei = 0;
+                            arg.out_element_op_(
+                                v_out,
+                                ck::type_convert<float>(arg.out_n_k_ho_wo_(n, k, ho, wo)));
+                            arg.wei_element_op_(
+                                v_wei, ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
+                            v_acc += v_out * v_wei;
+                        }
+                    }
+                }
+                // The input element-wise op is applied to the accumulated sum
+                // before converting back to the input data type.
+                float v_in;
+                arg.in_element_op_(v_in, v_acc);
+                arg.in_n_c_hi_wi_(n, c, hi, wi) = ck::type_convert<InDataType>(v_in);
+            };
+            // Apply f_nchw over the four input-tensor dimensions; the value of
+            // hardware_concurrency() is handed to the functor (presumably the
+            // number of worker threads -- see make_ParallelTensorFunctor).
+            make_ParallelTensorFunctor(f_nchw,
+                                       arg.in_n_c_hi_wi_.mDesc.GetLengths()[0],
+                                       arg.in_n_c_hi_wi_.mDesc.GetLengths()[1],
+                                       arg.in_n_c_hi_wi_.mDesc.GetLengths()[2],
+                                       arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+            return 0;
+        }
+        // Type-erased entry point; the int parameter is ignored.
+        // NOTE(review): p_arg is dereferenced without checking the result of
+        // dynamic_cast, so it must point to a ReferenceConvBwdData::Argument.
+        float Run(const device::BaseArgument* p_arg, int) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+    // No argument validation is performed; always reports support.
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+    // Convenience factory forwarding all parameters to the Argument constructor.
+    static auto MakeArgument(Tensor<InDataType>& in_n_c_hi_wi,
+                             const Tensor<WeiDataType>& wei_k_c_y_x,
+                             const Tensor<OutDataType>& out_n_k_ho_wo,
+                             std::vector<ck::index_t> conv_filter_strides,
+                             std::vector<ck::index_t> conv_filter_dilations,
+                             std::vector<ck::index_t> input_left_pads,
+                             std::vector<ck::index_t> input_right_pads,
+                             InElementwiseOperation in_element_op,
+                             WeiElementwiseOperation wei_element_op,
+                             OutElementwiseOperation out_element_op)
+    {
+        return Argument{in_n_c_hi_wi,
+                        wei_k_c_y_x,
+                        out_n_k_ho_wo,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        in_element_op,
+                        wei_element_op,
+                        out_element_op};
+    }
+    static auto MakeInvoker() { return Invoker{}; }
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+    // Returns the operator name followed by a newline.
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "ReferenceConvBwdData"
+            << std::endl;
+        // clang-format on
+        return str.str();
+    }
+};
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
+#endif
--- a/test/conv2d_bwd_data/main.cpp
+++ b/test/conv2d_bwd_data/main.cpp
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_conv.hpp"
+#include "tensor_layout.hpp"
+#include "device_tensor.hpp"
+#include "device_conv_bwd_data.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_conv_bwd_data.hpp"
+// Short aliases for the element types this test can be run with.
+using F16  = ck::half_t;
+using F32  = float;
+// NOTE(review): `ushort` is not a standard C++ type name; presumably it is
+// provided by a system or project header -- confirm.
+using BF16 = ushort;
+using INT8 = int8_t;
+// Declarations of the device-instance registration functions used by this
+// test. No definitions appear in this file, so they must be supplied at link
+// time.
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_conv2d_bwd_data_instance {
+// Pointer type for a backward-data convolution operator whose three
+// element-wise operations are all PassThrough.
+using DeviceConvBwdDataNoOpPtr =
+    DeviceConvBwdDataPtr<ck::tensor_operation::element_wise::PassThrough,
+                         ck::tensor_operation::element_wise::PassThrough,
+                         ck::tensor_operation::element_wise::PassThrough>;
+// One registration function per data type (f32, f16, bf16, int8), all for the
+// NHWC/KYXC/NHWK layout. Each receives the instance vector by non-const
+// reference; presumably it appends the available instances to it.
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
+    std::vector<DeviceConvBwdDataNoOpPtr>&);
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvBwdDataNoOpPtr>&);
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
+    std::vector<DeviceConvBwdDataNoOpPtr>&);
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
+    std::vector<DeviceConvBwdDataNoOpPtr>&);
+} // namespace device_conv2d_bwd_data_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+// Element-wise operations applied to the input, weight and output tensors;
+// this test uses PassThrough for all three.
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+// Compares two tensors element by element over their mData storage.
+//
+// Returns true only when both tensors hold the same number of elements and
+// every absolute difference |ref - result| is at most 1e-6. A difference that
+// is NaN counts as a mismatch.
+template <typename T>
+static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
+{
+    // Differently sized tensors can never match, and indexing `result` with
+    // ref's indices would read out of bounds.
+    if(ref.mData.size() != result.mData.size())
+    {
+        return false;
+    }
+    const float max_diff = 1e-6;
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
+    {
+        const float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        // Written as a negated "<=" so that a NaN difference, for which every
+        // ordered comparison is false, is reported as a failure.
+        if(!(diff <= max_diff))
+        {
+            return false;
+        }
+    }
+    return true;
+}
+// Test driver: runs every registered device instance of 2D backward-data
+// convolution (NHWC/KYXC/NHWK) for one data type and compares each result with
+// the host reference implementation.
+//
+// Accepted command lines:
+//   <prog> data_type init_method
+//   <prog> data_type init_method N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
+// Any other argument count prints the usage text and exits with code 1.
+//
+// Exit status: 0 when no supported instance produced a mismatch, 1 when at
+// least one did or when data_type is not in 0..3.
+// NOTE(review): if no instance reports IsSupportedArgument(), nothing is
+// compared and the test still passes.
+int main(int argc, char* argv[])
+{
+    int data_type   = 0;
+    int init_method = 0;
+    // Conv shape
+    ck::index_t N               = 128;
+    ck::index_t K               = 256;
+    ck::index_t C               = 192;
+    ck::index_t Y               = 3;
+    ck::index_t X               = 3;
+    ck::index_t Hi              = 71;
+    ck::index_t Wi              = 71;
+    ck::index_t conv_stride_h   = 2;
+    ck::index_t conv_stride_w   = 2;
+    ck::index_t conv_dilation_h = 1;
+    ck::index_t conv_dilation_w = 1;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+    if(argc == 3)
+    {
+        data_type   = std::stoi(argv[1]);
+        init_method = std::stoi(argv[2]);
+    }
+    else if(argc == 18)
+    {
+        data_type   = std::stoi(argv[1]);
+        init_method = std::stoi(argv[2]);
+        N               = std::stoi(argv[3]);
+        K               = std::stoi(argv[4]);
+        C               = std::stoi(argv[5]);
+        Y               = std::stoi(argv[6]);
+        X               = std::stoi(argv[7]);
+        Hi              = std::stoi(argv[8]);
+        Wi              = std::stoi(argv[9]);
+        conv_stride_h   = std::stoi(argv[10]);
+        conv_stride_w   = std::stoi(argv[11]);
+        conv_dilation_h = std::stoi(argv[12]);
+        conv_dilation_w = std::stoi(argv[13]);
+        in_left_pad_h   = std::stoi(argv[14]);
+        in_left_pad_w   = std::stoi(argv[15]);
+        in_right_pad_h  = std::stoi(argv[16]);
+        in_right_pad_w  = std::stoi(argv[17]);
+    }
+    else
+    {
+        // Usage text mirrors the two argument layouts parsed above.
+        printf("arg1: data type (0=fp32, 1=fp16, 2=bf16, 3=int8)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3 to 17 (optional): N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, "
+               "RightPy, RightPx\n");
+        exit(1);
+    }
+    // Runs the whole test for one element-type combination. The arguments are
+    // type tags only; their values are never read. Returns true when no
+    // supported device instance disagreed with the host reference.
+    auto Run = [&](auto input_type, auto wei_type, auto out_type) {
+        using InDataType  = decltype(input_type);
+        using WeiDataType = decltype(wei_type);
+        using OutDataType = decltype(out_type);
+        using ReferenceConvBwdInstance =
+            ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
+                                                             WeiDataType,
+                                                             OutDataType,
+                                                             InElementOp,
+                                                             WeiElementOp,
+                                                             OutElementOp>;
+        // Effective (dilated) filter extent and resulting output spatial size.
+        const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+        const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+        const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+        const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+        const std::vector<ck::index_t> input_spatial_lengths{{Hi, Wi}};
+        const std::vector<ck::index_t> filter_spatial_lengths{{Y, X}};
+        const std::vector<ck::index_t> output_spatial_lengths{{Ho, Wo}};
+        const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}};
+        const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}};
+        const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}};
+        const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}};
+        // Lengths are given in (N, C, H, W) order while the strides make the
+        // C dimension contiguous (stride 1), i.e. channels-last storage.
+        auto f_host_tensor_descriptor =
+            [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
+                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+            };
+        Tensor<OutDataType> out_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo));
+        Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X));
+        Tensor<InDataType> in_n_c_hi_wi_host_result(f_host_tensor_descriptor(N, C, Hi, Wi));
+        Tensor<InDataType> in_n_c_hi_wi_device_result(f_host_tensor_descriptor(N, C, Hi, Wi));
+        std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi_host_result.mDesc << std::endl;
+        std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
+        std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;
+        switch(init_method)
+        {
+        case 0: break;
+        case 1:
+            out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+            break;
+        default:
+            out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
+            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
+        }
+        DeviceMem in_device_buf(sizeof(InDataType) *
+                                in_n_c_hi_wi_device_result.mDesc.GetElementSpace());
+        DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
+        DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace());
+        out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
+        wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+        // Fill the device-side input buffer with generated values before any
+        // kernel runs.
+        in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{5});
+        in_device_buf.ToDevice(in_n_c_hi_wi_device_result.mData.data());
+        // get host result
+        {
+            auto ref_conv    = ReferenceConvBwdInstance{};
+            auto ref_invoker = ref_conv.MakeInvoker();
+            auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result,
+                                                      wei_k_c_y_x,
+                                                      out_n_k_ho_wo,
+                                                      conv_filter_strides,
+                                                      conv_filter_dilations,
+                                                      input_left_pads,
+                                                      input_right_pads,
+                                                      InElementOp{},
+                                                      WeiElementOp{},
+                                                      OutElementOp{});
+            ref_invoker.Run(ref_argument);
+        }
+        using PassThrough              = ck::tensor_operation::element_wise::PassThrough;
+        using DeviceConvBwdDataNoOpPtr = ck::tensor_operation::device::
+            DeviceConvBwdDataPtr<PassThrough, PassThrough, PassThrough>;
+        // add device Conv instances
+        std::vector<DeviceConvBwdDataNoOpPtr> conv_ptrs;
+        if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
+                     ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
+                     ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
+        {
+            ck::tensor_operation::device::device_conv2d_bwd_data_instance::
+                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
+        }
+        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
+                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
+                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
+        {
+            ck::tensor_operation::device::device_conv2d_bwd_data_instance::
+                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+        }
+        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ushort> &&
+                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ushort> &&
+                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ushort>)
+        {
+            ck::tensor_operation::device::device_conv2d_bwd_data_instance::
+                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
+        }
+        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, int8_t> &&
+                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, int8_t> &&
+                          ck::is_same_v<ck::remove_cv_t<OutDataType>, int8_t>)
+        {
+            ck::tensor_operation::device::device_conv2d_bwd_data_instance::
+                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
+        }
+        if(conv_ptrs.empty())
+        {
+            throw std::runtime_error("wrong! no device Conv instance found");
+        }
+        // profile device Conv instances
+        bool success = true;
+        for(auto& conv_ptr : conv_ptrs)
+        {
+            auto argument_ptr = conv_ptr->MakeArgumentPointer(
+                static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                N,
+                K,
+                C,
+                input_spatial_lengths,
+                filter_spatial_lengths,
+                output_spatial_lengths,
+                conv_filter_strides,
+                conv_filter_dilations,
+                input_left_pads,
+                input_right_pads,
+                InElementOp{},
+                WeiElementOp{},
+                OutElementOp{});
+            if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
+            {
+                auto invoker_ptr = conv_ptr->MakeInvokerPointer();
+                invoker_ptr->Run(argument_ptr.get(), 1);
+                in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
+                if(!check_out(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result))
+                {
+                    std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
+                    success = false;
+                }
+                else
+                {
+                    std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
+                }
+            }
+            else
+            {
+                std::cout << "Not support Info: " << conv_ptr->GetTypeString() << std::endl;
+            }
+        }
+        if(success)
+        {
+            std::cout << "test conv2d bwd : Pass" << std::endl;
+        }
+        else
+        {
+            std::cout << "test conv2d bwd: Fail " << std::endl;
+        }
+        // Hand the verdict back so main() can turn it into the exit status.
+        return success;
+    };
+    bool pass = false;
+    if(data_type == 0)
+    {
+        pass = Run(float(), float(), F32());
+    }
+    else if(data_type == 1)
+    {
+        pass = Run(F16(), F16(), F16());
+    }
+    else if(data_type == 2)
+    {
+        pass = Run(BF16(), BF16(), BF16());
+    }
+    else if(data_type == 3)
+    {
+        pass = Run(INT8(), INT8(), INT8());
+    }
+    else
+    {
+        return 1;
+    }
+    // A mismatch on any instance must fail the test process, not just print.
+    return pass ? 0 : 1;
+}
--- a/test/space_filling_curve/space_filling_curve.cpp
+++ b/test/space_filling_curve/space_filling_curve.cpp
@@ -29,9 +29,9 @@ void traverse_using_space_filling_curve()
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
-    using TensorLengths     = Sequence<4, 10, 9>;
+    using TensorLengths     = Sequence<16, 10, 9>;
    using DimAccessOrder    = Sequence<2, 0, 1>;
-    using ScalarsPerAccess  = Sequence<1, 2, 3>;
+    using ScalarsPerAccess  = Sequence<4, 2, 3>;
    using SpaceFillingCurve = SpaceFillingCurve<TensorLengths, DimAccessOrder, ScalarsPerAccess>;
    constexpr auto expected = make_tuple(make_tuple(0, 0, 0),
@@ -39,36 +39,36 @@ void traverse_using_space_filling_curve()
                                         make_tuple(0, 4, 0),
                                         make_tuple(0, 6, 0),
                                         make_tuple(0, 8, 0),
-                                         make_tuple(1, 8, 0),
+                                         make_tuple(4, 8, 0),
-                                         make_tuple(1, 6, 0),
+                                         make_tuple(4, 6, 0),
-                                         make_tuple(1, 4, 0),
+                                         make_tuple(4, 4, 0),
-                                         make_tuple(1, 2, 0),
+                                         make_tuple(4, 2, 0),
-                                         make_tuple(1, 0, 0),
+                                         make_tuple(4, 0, 0),
-                                         make_tuple(2, 0, 0),
+                                         make_tuple(8, 0, 0),
-                                         make_tuple(2, 2, 0),
+                                         make_tuple(8, 2, 0),
-                                         make_tuple(2, 4, 0),
+                                         make_tuple(8, 4, 0),
-                                         make_tuple(2, 6, 0),
+                                         make_tuple(8, 6, 0),
-                                         make_tuple(2, 8, 0),
+                                         make_tuple(8, 8, 0),
-                                         make_tuple(3, 8, 0),
+                                         make_tuple(12, 8, 0),
-                                         make_tuple(3, 6, 0),
+                                         make_tuple(12, 6, 0),
-                                         make_tuple(3, 4, 0),
+                                         make_tuple(12, 4, 0),
-                                         make_tuple(3, 2, 0),
+                                         make_tuple(12, 2, 0),
-                                         make_tuple(3, 0, 0),
+                                         make_tuple(12, 0, 0),
-                                         make_tuple(3, 0, 3),
+                                         make_tuple(12, 0, 3),
-                                         make_tuple(3, 2, 3),
+                                         make_tuple(12, 2, 3),
-                                         make_tuple(3, 4, 3),
+                                         make_tuple(12, 4, 3),
-                                         make_tuple(3, 6, 3),
+                                         make_tuple(12, 6, 3),
-                                         make_tuple(3, 8, 3),
+                                         make_tuple(12, 8, 3),
-                                         make_tuple(2, 8, 3),
+                                         make_tuple(8, 8, 3),
-                                         make_tuple(2, 6, 3),
+                                         make_tuple(8, 6, 3),
-                                         make_tuple(2, 4, 3),
+                                         make_tuple(8, 4, 3),
-                                         make_tuple(2, 2, 3),
+                                         make_tuple(8, 2, 3),
-                                         make_tuple(2, 0, 3),
+                                         make_tuple(8, 0, 3),
-                                         make_tuple(1, 0, 3),
+                                         make_tuple(4, 0, 3),
-                                         make_tuple(1, 2, 3),
+                                         make_tuple(4, 2, 3),
-                                         make_tuple(1, 4, 3),
+                                         make_tuple(4, 4, 3),
-                                         make_tuple(1, 6, 3),
+                                         make_tuple(4, 6, 3),
-                                         make_tuple(1, 8, 3),
+                                         make_tuple(4, 8, 3),
                                         make_tuple(0, 8, 3),
                                         make_tuple(0, 6, 3),
                                         make_tuple(0, 4, 3),
@@ -79,21 +79,21 @@ void traverse_using_space_filling_curve()
                                         make_tuple(0, 4, 6),
                                         make_tuple(0, 6, 6),
                                         make_tuple(0, 8, 6),
-                                         make_tuple(1, 8, 6),
+                                         make_tuple(4, 8, 6),
-                                         make_tuple(1, 6, 6),
+                                         make_tuple(4, 6, 6),
-                                         make_tuple(1, 4, 6),
+                                         make_tuple(4, 4, 6),
-                                         make_tuple(1, 2, 6),
+                                         make_tuple(4, 2, 6),
-                                         make_tuple(1, 0, 6),
+                                         make_tuple(4, 0, 6),
-                                         make_tuple(2, 0, 6),
+                                         make_tuple(8, 0, 6),
-                                         make_tuple(2, 2, 6),
+                                         make_tuple(8, 2, 6),
-                                         make_tuple(2, 4, 6),
+                                         make_tuple(8, 4, 6),
-                                         make_tuple(2, 6, 6),
+                                         make_tuple(8, 6, 6),
-                                         make_tuple(2, 8, 6),
+                                         make_tuple(8, 8, 6),
-                                         make_tuple(3, 8, 6),
+                                         make_tuple(12, 8, 6),
-                                         make_tuple(3, 6, 6),
+                                         make_tuple(12, 6, 6),
-                                         make_tuple(3, 4, 6),
+                                         make_tuple(12, 4, 6),
-                                         make_tuple(3, 2, 6),
+                                         make_tuple(12, 2, 6),
-                                         make_tuple(3, 0, 6));
+                                         make_tuple(12, 0, 6));
    constexpr index_t num_accesses = SpaceFillingCurve::GetNumOfAccess();