Unverified Commit 781cacd2 authored by ltqin, committed by GitHub

NHWC Conv2d Bwd weight fp16 ckprofiler and test (#166)

* change backward weight name

* start add bwd weight lib and profiler

* change tuning parameter

* change output info

* add bwd weight test

* change test info

* using conv_util

* change wgt to weight

* add }

* add fp32
parent 82c8b9f8
add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
-# Instructions for ```example_conv2d_wrw_xdl``` Example
+# Instructions for ```example_conv2d_bwd_weight_xdl``` Example
-## Run ```example_conv2d_wrw_xdl```
+## Run ```example_conv2d_bwd_weight_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4: is show log (0=no, 1=yes)
#arg5 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx, split-k
-./example/conv2d_fwd_xdl 0 1 5 0 4
+./bin/example_conv2d_bwd_weight_xdl 0 1 5 0 4
```
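For reference, a fully spelled-out invocation might look like the sketch below. This is a hypothetical example (the shape values are borrowed from the test defaults later in this commit), and note the usage text above names 16 values starting at arg5, so split-k would actually land at arg20:

```bash
# arg1-4: verification=0, integer init, 5 repeats, no log
# arg5 onward: N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx split-k
./bin/example_conv2d_bwd_weight_xdl 0 1 5 0 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 1
```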
Result
...
@@ -32,8 +32,8 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
// clang-format off
-using DeviceConvWrWInstance = ck::tensor_operation::device::
+using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
-    DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+    DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
        InDataType,  // InDataType
        WeiDataType, // WeiDataType
        OutDataType, // OutDataType
@@ -70,8 +70,8 @@ using DeviceConvWrWInstance = ck::tensor_operation::device::
        8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
-using ReferenceConvWrwInstance = ck::tensor_operation::host::
+using ReferenceConvBwdWeightInstance = ck::tensor_operation::host::
-    ReferenceConvWrw<InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;
+    ReferenceConvBwdWeight<InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;
int main(int argc, char* argv[])
{
@@ -211,7 +211,7 @@ int main(int argc, char* argv[])
    wei_device_buf.ToDevice(wei_k_c_y_x_device_result.mData.data());
    // do GEMM
-   auto conv = DeviceConvWrWInstance{};
+   auto conv = DeviceConvBwdWeightInstance{};
    auto invoker = conv.MakeInvoker();
    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
@@ -256,7 +256,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-       auto ref_conv = ReferenceConvWrwInstance{};
+       auto ref_conv = ReferenceConvBwdWeightInstance{};
        auto ref_invoker = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
...
-add_example_executable(example_conv2d_bwd_wgt_xdl conv2d_bwd_wgt_xdl.cpp)
@@ -35,7 +35,7 @@ add_subdirectory(07_conv2d_fwd_bias_relu_add)
add_subdirectory(08_conv3d_fwd)
add_subdirectory(09_convnd_fwd)
add_subdirectory(10_conv2d_bwd_data)
-add_subdirectory(11_conv2d_bwd_wgt)
+add_subdirectory(11_conv2d_bwd_weight)
add_subdirectory(12_reduce)
add_subdirectory(13_pool2d_fwd)
add_subdirectory(14_gemm_xdl_requant_relu_requant)
...
@@ -52,10 +52,13 @@ template <typename InDataType,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CBlockTransferScalarPerVector_NWaveNPerXdl>
-struct DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
+struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    : public DeviceConvWrw<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>
+    : public DeviceConvBwdWeight<InElementwiseOperation,
+                                 WeiElementwiseOperation,
+                                 OutElementwiseOperation>
{
-    using DeviceOp = DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
+    using DeviceOp =
+        DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
    using ADataType = OutDataType;
    using BDataType = InDataType;
@@ -68,8 +71,6 @@ struct DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
    // TODO make A/B datatype different
    using ABDataType = InDataType;
-   static constexpr index_t NDimSpatial = 2;
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};
@@ -691,7 +692,7 @@ struct DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
        auto str = std::stringstream();
        // clang-format off
-       str << "DeviceConv2dWrWXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
+       str << "DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
            << "<"
            << BlockSize << ", "
            << MPerBlock << ", "
...
@@ -11,7 +11,7 @@ namespace device {
template <typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation>
-struct DeviceConvWrw : public BaseOperator
+struct DeviceConvBwdWeight : public BaseOperator
{
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_in,
@@ -38,8 +38,8 @@ struct DeviceConvWrw : public BaseOperator
template <typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation>
-using DeviceConvWrwPtr = std::unique_ptr<
-    DeviceConvWrw<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>>;
+using DeviceConvBwdWeightPtr = std::unique_ptr<
+    DeviceConvBwdWeight<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>>;
} // namespace device
} // namespace tensor_operation
...
@@ -17,7 +17,7 @@ template <typename InDataType,
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation>
-struct ReferenceConvWrw : public device::BaseOperator
+struct ReferenceConvBwdWeight : public device::BaseOperator
{
    // Argument
    struct Argument : public device::BaseArgument
@@ -62,7 +62,7 @@ struct ReferenceConvWrw : public device::BaseOperator
    // Invoker
    struct Invoker : public device::BaseInvoker
    {
-       using Argument = ReferenceConvWrw::Argument;
+       using Argument = ReferenceConvBwdWeight::Argument;
        float Run(const Argument& arg)
        {
@@ -163,7 +163,7 @@ struct ReferenceConvWrw : public device::BaseOperator
        auto str = std::stringstream();
        // clang-format off
-       str << "ReferenceConvFwd"
+       str << "ReferenceConvBwdWeight"
            << std::endl;
        // clang-format on
...
@@ -39,4 +39,5 @@ add_subdirectory(conv2d_bwd_data)
add_subdirectory(reduce)
add_subdirectory(convnd_bwd_data)
add_subdirectory(grouped_gemm)
+add_subdirectory(conv2d_bwd_weight)
add_subdirectory(batched_gemm_reduce)
# device_conv2d_bwd_weight_instance
set(DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp;
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp;
)
add_library(device_conv2d_bwd_weight_instance SHARED ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE})
target_compile_features(device_conv2d_bwd_weight_instance PUBLIC)
set_target_properties(device_conv2d_bwd_weight_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_conv2d_bwd_weight_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_bwd_weight_instance)
#include <stdlib.h>
#include "config.hpp"
#include "device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_bwd_weight_instance {
using F16 = ck::half_t;
using F32 = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
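// Note: per the ADataType/BDataType aliases in the device op above (A = out, B = in),
// the backward-weight problem presumably maps to a GEMM with GemmM = K,
// GemmN = Y*X*C (the flattened filter) and GemmK = N*Ho*Wo, with split-k
// partitioning the GemmK (N*Ho*Wo) reduction.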
using device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple<
// clang-format off
//#################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer|
//#################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths|ScalarPerVector|
//#################################################################################| | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl|
//#################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| |
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>
// clang-format on
>;
void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(
std::vector<DeviceConvBwdWeightPtr<PassThrough, PassThrough, PassThrough>>& instances)
{
add_device_operation_instances(instances,
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances{});
}
} // namespace device_conv2d_bwd_weight_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include <stdlib.h>
#include "config.hpp"
#include "device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_bwd_weight_instance {
using F32 = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
using device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances = std::tuple<
// clang-format off
//#################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer|
//#################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths|ScalarPerVector|
//#################################################################################| | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl|
//#################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| |
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 8>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 16, 1, 4>, 4>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>
// clang-format on
>;
void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(
std::vector<DeviceConvBwdWeightPtr<PassThrough, PassThrough, PassThrough>>& instances)
{
add_device_operation_instances(instances,
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances{});
}
} // namespace device_conv2d_bwd_weight_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -35,6 +35,7 @@ set(PROFILER_SOURCE
    src/profile_convnd_bwd_data.cpp
    src/profile_reduce.cpp
    src/profile_grouped_gemm.cpp
+   src/profile_conv_bwd_weight.cpp
    src/profile_batched_gemm_reduce.cpp
)
@@ -55,4 +56,5 @@ target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
+target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_backward_weight.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_backward_weight.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_bwd_weight_instance {
using DeviceConvBwdWeightNoOpPtr =
DeviceConvBwdWeightPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(
std::vector<DeviceConvBwdWeightNoOpPtr>&);
void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(
std::vector<DeviceConvBwdWeightNoOpPtr>&);
} // namespace device_conv2d_bwd_weight_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
bool profile_conv_bwd_weight_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
ck::index_t split_k)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
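// Note: both branches above keep the logical dimension order {N_, C_, H, W} and encode
// the layout purely in the strides; e.g. for NHWC the C stride is 1 and the W stride is
// C_, so channels are contiguous in memory.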
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x_host_result(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<WeiDataType> wei_k_c_y_x_device_result(
f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x_host_result.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
break;
default:
out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
}
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(do_verification)
{
using ReferenceConvBwdWeightInstance =
ck::tensor_operation::host::ReferenceConvBwdWeight<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
auto ref_conv = ReferenceConvBwdWeightInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x_host_result,
out_n_k_ho_wo,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) *
wei_k_c_y_x_device_result.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace());
out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceConvBwdWeightNoOpPtr =
ck::tensor_operation::device::DeviceConvBwdWeightPtr<PassThrough, PassThrough, PassThrough>;
// add device Conv instances
std::vector<DeviceConvBwdWeightNoOpPtr> conv_ptrs;
if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
{
ck::tensor_operation::device::device_conv2d_bwd_weight_instance::
add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
{
ck::tensor_operation::device::device_conv2d_bwd_weight_instance::
add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
}
if(conv_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device Conv instances
bool pass = true;
for(auto& conv_ptr : conv_ptrs)
{
// split-K accumulates into the weight buffer with atomic adds, so reset it before each run
if(split_k > 1)
{
wei_device_buf.SetZero();
}
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op,
split_k);
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
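// ave_time is in ms, so flop / 1.E9 / ms yields TFLOPS and num_btype / 1.E6 / ms yields GB/s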
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(tflops > best_tflops)
{
best_conv_name = conv_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
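// The tolerance below is a loose absolute bound, presumably to absorb rounding
// differences from fp16 accumulation and split-K atomic adds vs. the host reference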
if(max_error > 8)
{
pass = false;
std::cout << "Fail info:" << conv_ptr->GetTypeString() << std::endl;
}
if(do_log)
{
LogRangeAsType<float>(std::cout << "out: ", out_n_k_ho_wo.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "wei_device: ", wei_k_c_y_x_device_result.mData, ",")
<< std::endl;
}
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_conv_bwd_weight_impl.hpp"
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum struct ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum struct ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum struct ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
int profile_conv_bwd_weight(int argc, char* argv[])
{
if(argc != 26)
{
printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
printf("arg25: split k (>=1)\n");
exit(1);
}
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
ck::index_t split_k = std::stoi(argv[25]);
split_k = std::max(1, split_k);
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
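// The YEff/Ho/Wo computations above are the standard dilated-convolution output-size
// formula; e.g. Hi = 71, pads 1 + 1, Y = 3, dilation 1, stride 2 gives YEff = 3 and
// Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36.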
if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_bwd_weight_impl<2,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w},
split_k);
}
else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_bwd_weight_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w},
split_k);
}
else
{
throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
}
return 1;
}
@@ -17,6 +17,7 @@ int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
int profile_convnd_bwd_data(int, char*[], int);
int profile_reduce(int, char*[]);
+int profile_conv_bwd_weight(int, char*[]);
int profile_batched_gemm_reduce(int, char*[]);
int main(int argc, char* argv[])
@@ -85,7 +86,12 @@ int main(int argc, char* argv[])
    {
        return profile_reduce(argc, argv);
    }
+   else if(strcmp(argv[1], "conv2d_bwd_weight") == 0)
+   {
+       return profile_conv_bwd_weight(argc, argv);
+   }
    else
    {
        // clang-format off
        printf("arg1: tensor operation (gemm: GEMM\n"
               " gemm_bias_2d: GEMM+Bias(2D)\n"
@@ -97,12 +103,12 @@ int main(int argc, char* argv[])
               " conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
               " conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
               " conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
-              " conv1d_bwd_data: BackwardConvolution data 1d\n"
+              " conv1d_bwd_data: BackwardConvolution data 1 dim\n"
-              " conv2d_bwd_data: BackwardConvolution data 2d\n"
+              " conv2d_bwd_data: BackwardConvolution data 2 dim\n"
-              " conv3d_bwd_data: BackwardConvolution data 3d\n"
+              " conv3d_bwd_data: BackwardConvolution data 3 dim\n"
-              " grouped_gemm: Grouped GEMM\n"
+              " reduce: REDUCE\n"
-              " reduce: Reduce\n");
+              " conv2d_bwd_weight: Backward Weight Convolution 2d\n");
        // clang-format on
    }
    return 0;
}
@@ -43,3 +43,4 @@ add_subdirectory(batched_gemm_reduce)
add_subdirectory(grouped_gemm)
add_subdirectory(convnd_fwd)
add_subdirectory(reduce)
+add_subdirectory(conv2d_bwd_weight)
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/profiler/include
${PROJECT_SOURCE_DIR}/external/include/half
)
add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor)
target_link_libraries(test_conv2d_bwd_weight PRIVATE device_conv2d_bwd_weight_instance)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include <vector>
#include "conv_utils.hpp"
#include "profile_conv_bwd_weight_impl.hpp"
bool test_self()
{
bool pass = true;
std::vector<ck::conv_util::ConvParams> params;
params.push_back({2, 128, 256, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
params.push_back({2, 128, 256, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
params.push_back({2, 128, 256, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
for(auto& param : params)
{
// f32
pass &= ck::profiler::profile_conv_bwd_weight_impl<2,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
1, // do_verification,
1, // init_method,
0, // do_log,
1, // nrepeat,
param.N,
param.K,
param.C,
param.input_spatial_lengths,
param.filter_spatial_lengths,
param.GetOutputSpatialLengths(),
param.conv_filter_strides,
param.conv_filter_dilations,
param.input_left_pads,
param.input_right_pads,
2); // split_k
// fp16
pass &= ck::profiler::profile_conv_bwd_weight_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
1, // do_verification,
1, // init_method,
0, // do_log,
1, // nrepeat,
param.N,
param.K,
param.C,
param.input_spatial_lengths,
param.filter_spatial_lengths,
param.GetOutputSpatialLengths(),
param.conv_filter_strides,
param.conv_filter_dilations,
param.input_left_pads,
param.input_right_pads,
2); // split_k
}
return pass;
}
int main(int argc, char* argv[])
{
int data_type = 0;
int init_method = 0;
// Conv shape
ck::index_t N = 128;
ck::index_t K = 256;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t conv_stride_h = 2;
ck::index_t conv_stride_w = 2;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
ck::index_t split_k = 1;
bool pass = true;
if(argc == 1)
{
pass = test_self();
}
else
{
if(argc == 3)
{
data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
}
else if(argc == 19)
{
data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
N = std::stoi(argv[3]);
K = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
conv_stride_h = std::stoi(argv[10]);
conv_stride_w = std::stoi(argv[11]);
conv_dilation_h = std::stoi(argv[12]);
conv_dilation_w = std::stoi(argv[13]);
in_left_pad_h = std::stoi(argv[14]);
in_left_pad_w = std::stoi(argv[15]);
in_right_pad_h = std::stoi(argv[16]);
in_right_pad_w = std::stoi(argv[17]);
split_k = std::stoi(argv[18]);
}
else
{
printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
ck::conv_util::ConvParams param{2,
N,
K,
C,
{Y, X},
{Hi, Wi},
{conv_stride_h, conv_stride_w},
{conv_dilation_h, conv_dilation_w},
{in_left_pad_h, in_left_pad_w},
{in_right_pad_h, in_right_pad_w}};
if(data_type == 0)
{
pass = ck::profiler::profile_conv_bwd_weight_impl<2,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
1,
init_method,
0,
1,
param.N,
param.K,
param.C,
param.input_spatial_lengths,
param.filter_spatial_lengths,
param.GetOutputSpatialLengths(),
param.conv_filter_strides,
param.conv_filter_dilations,
param.input_left_pads,
param.input_right_pads,
split_k);
}
else if(data_type == 1)
{
pass = ck::profiler::profile_conv_bwd_weight_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
1,
init_method,
0,
1,
param.N,
param.K,
param.C,
param.input_spatial_lengths,
param.filter_spatial_lengths,
param.GetOutputSpatialLengths(),
param.conv_filter_strides,
param.conv_filter_dilations,
param.input_left_pads,
param.input_right_pads,
split_k);
}
else
{
std::cout << "Not support data type" << std::endl;
return 1;
}
}
if(pass)
{
std::cout << "test conv2d bwd weight : Pass" << std::endl;
return 0;
}
else
{
std::cout << "test conv2d bwd weight: Fail " << std::endl;
return -1;
}
}