Unverified Commit acbd7bd7 authored by Chao Liu, committed by GitHub

Fusion Conv+Bias+ReLU(+Add) (#62)

* fix relu

* clean up

* clean up

* adding 1x1 conv

* adding 1x1 conv

* added 1x1 conv

* refactor

* refactor

* refactor

* added profiler for conv+bias+relu+add

* clean up

* adding conv+bias+relu

* adding conv+bias+relu

* added conv+bias+relu

* Update README.md

* update cpu verification

* adding c shuffle

* update static_tensor for dealing with invalid element

* adding c shuffle

* debugging

* fix bug

* convert to fp16 before shuffle

* shuffle more than one M/NRepeat

* clean up

* remove coordinate step hack from GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1

* clean up

* remove coordinate step hack from all gridwise gemm xdl

* clean up coordinate step hack

* clean up coordinate step hack

* ThreadwiseTensorSliceTransfer_v3r2 support pointwise op on both src and dst

* adding output shuffle in conv+bias+relu+add

* update

* added conv+bias+relu+add with c shuffle

* added conv+bias+relu+add with c shuffle

* fix forward_sweep bugs in threadwise copy

* clean up

* refactor

* clean up

* clean up

* added conv_c_shuffle+bias_relu

* clean up

* added conv+bias+relu+atomic_add

* clean up

* clean up

* clean up

* clean up

* clean up

* clean up

* misc fixes; add 1x1 specialization

* clean up

* delete unused device op

* clean up

* add support for odd C value
parent a4f24233
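
The thread running through the change list above is that bias, ReLU, and the optional residual add are applied as element-wise (pointwise) operations during the convolution's output write-back, using the functors from element_wise_operation.hpp (such as PassThrough) that appear in the diff below. As a rough illustration of that idea only, here is a minimal, self-contained sketch of a fused Bias+ReLU output functor; the name AddRelu and the exact operator() signature are assumptions for illustration, not the library's actual definition.

#include <algorithm>
#include <iostream>

// Hypothetical fused Bias+ReLU output op, in the spirit of the element_wise
// functors used by the fused conv kernels (name and signature are illustrative).
struct AddRelu
{
    // y = max(conv_out + bias, 0), applied to each output element as it is stored
    void operator()(float& y, const float& conv_out, const float& bias) const
    {
        y = std::max(conv_out + bias, 0.0f);
    }
};

int main()
{
    AddRelu op;
    float y = 0.0f;
    op(y, -1.5f, 0.5f);          // -1.5 + 0.5 is negative, so ReLU clamps the output to 0
    std::cout << y << std::endl; // prints 0
}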
@@ -6,40 +6,26 @@
 #include "host_conv.hpp"
 #include "tensor_layout.hpp"
 #include "device_tensor.hpp"
-#include "device_conv.hpp"
-#include "device_conv_instance.hpp"
+#include "device_conv_fwd.hpp"
 #include "element_wise_operation.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_conv_instance {
+namespace device_conv2d_fwd_instance {
 
 using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
                                               ck::tensor_operation::element_wise::PassThrough,
                                               ck::tensor_operation::element_wise::PassThrough>;
 
-template <>
-void add_device_conv_fwd_instance<2,
-                                  float,
-                                  float,
-                                  float,
-                                  ck::tensor_layout::convolution::NHWC,
-                                  ck::tensor_layout::convolution::KYXC,
-                                  ck::tensor_layout::convolution::NHWK>(
-    std::vector<DeviceConvFwdNoOpPtr>&);
-
-template <>
-void add_device_conv_fwd_instance<2,
-                                  ck::half_t,
-                                  ck::half_t,
-                                  ck::half_t,
-                                  ck::tensor_layout::convolution::NHWC,
-                                  ck::tensor_layout::convolution::KYXC,
-                                  ck::tensor_layout::convolution::NHWK>(
-    std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvFwdNoOpPtr>&);
 
-} // namespace device_conv_instance
+} // namespace device_conv2d_fwd_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
@@ -54,20 +40,20 @@ template <int NDimSpatial,
           typename InLayout,
           typename WeiLayout,
           typename OutLayout>
-void profile_conv(int do_verification,
+void profile_conv_fwd_impl(int do_verification,
                            int init_method,
                            bool do_log,
                            int nrepeat,
                            ck::index_t N,
                            ck::index_t K,
                            ck::index_t C,
                            std::vector<ck::index_t> input_spatial_lengths,
                            std::vector<ck::index_t> filter_spatial_lengths,
                            std::vector<ck::index_t> output_spatial_lengths,
                            std::vector<ck::index_t> conv_filter_strides,
                            std::vector<ck::index_t> conv_filter_dilations,
                            std::vector<ck::index_t> input_left_pads,
                            std::vector<ck::index_t> input_right_pads)
 {
     const ck::index_t Y = filter_spatial_lengths[0];
     const ck::index_t X = filter_spatial_lengths[1];
@@ -146,20 +132,30 @@ void profile_conv(int do_verification,
     // add device Conv instances
     std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
 
-    ck::tensor_operation::device::device_conv_instance::add_device_conv_fwd_instance<2,
-                                                                                     InDataType,
-                                                                                     WeiDataType,
-                                                                                     OutDataType,
-                                                                                     InLayout,
-                                                                                     WeiLayout,
-                                                                                     OutLayout>(
-        conv_ptrs);
+    if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
+                 ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
+                 ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
+    {
+        ck::tensor_operation::device::device_conv2d_fwd_instance::
+            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
+    }
+    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
+                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
+                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
+    {
+        ck::tensor_operation::device::device_conv2d_fwd_instance::
+            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+
+        ck::tensor_operation::device::device_conv2d_fwd_instance::
+            add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+    }
 
     if(conv_ptrs.size() <= 0)
     {
         throw std::runtime_error("wrong! no device Conv instance found");
     }
 
+    std::string best_conv_name;
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
@@ -189,6 +185,8 @@ void profile_conv(int do_verification,
         if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            std::string conv_name = conv_ptr->GetTypeString();
+
             float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
@@ -202,10 +200,11 @@ void profile_conv(int do_verification,
             float gb_per_sec = num_btype / 1.E6 / ave_time;
 
             std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                      << " GB/s" << std::endl;
+                      << " GB/s, " << conv_name << std::endl;
 
             if(tflops > best_tflops)
             {
+                best_conv_name  = conv_name;
                 best_tflops     = tflops;
                 best_ave_time   = ave_time;
                 best_gb_per_sec = gb_per_sec;
@@ -235,7 +234,7 @@ void profile_conv(int do_verification,
     }
 
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s" << std::endl;
+              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
 }
 
 } // namespace profiler
......
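A quick worked example of the performance arithmetic used in profile_conv_fwd_impl above: flop = 2 * N * K * Ho * Wo * C * Y * X, and throughput is derived from the averaged kernel time. The sketch below assumes ave_time is in milliseconds (consistent with the " ms" label in the printout) and that num_btype counts input + weight + output bytes, since that line is elided from the hunk; the problem size and the 0.5 ms timing are made-up numbers purely for illustration.

#include <cstddef>
#include <iostream>

// Worked example of the profiler's metric arithmetic (made-up sizes and timing).
// Assumes fp16 tensors (2 bytes per element) and ave_time measured in milliseconds.
int main()
{
    const std::size_t N = 128, K = 256, C = 192, Y = 3, X = 3;
    const std::size_t Hi = 30, Wi = 30, Ho = 28, Wo = 28; // 3x3 filter, stride 1, no padding
    const double ave_time = 0.5;                          // ms, made-up measurement

    const std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
    const std::size_t num_btype =
        2 * (N * C * Hi * Wi) + 2 * (K * C * Y * X) + 2 * (N * K * Ho * Wo);

    const double tflops     = flop / 1.E9 / ave_time;      // GFLOP per ms == TFLOP/s
    const double gb_per_sec = num_btype / 1.E6 / ave_time; // MB per ms == GB/s

    std::cout << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl;
}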
@@ -88,16 +88,16 @@ template <typename ADataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
-void profile_gemm(int do_verification,
+void profile_gemm_impl(int do_verification,
                        int init_method,
                        bool do_log,
                        int nrepeat,
                        int M,
                        int N,
                        int K,
                        int StrideA,
                        int StrideB,
                        int StrideC)
 {
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -164,6 +164,7 @@ void profile_gemm(int do_verification,
         throw std::runtime_error("wrong! no device GEMM instance found");
     }
 
+    std::string best_gemm_name;
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
@@ -189,9 +190,12 @@ void profile_gemm(int do_verification,
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            std::string gemm_name = gemm_ptr->GetTypeString();
+
             float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
 
             std::size_t flop = std::size_t(2) * M * N * K;
+
             std::size_t num_btype =
                 sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + sizeof(CDataType) * M * N;
@@ -200,10 +204,11 @@ void profile_gemm(int do_verification,
             float gb_per_sec = num_btype / 1.E6 / ave_time;
 
             std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                      << " GB/s" << std::endl;
+                      << " GB/s, " << gemm_name << std::endl;
 
             if(tflops > best_tflops)
             {
+                best_gemm_name  = gemm_name;
                 best_tflops     = tflops;
                 best_ave_time   = ave_time;
                 best_gb_per_sec = gb_per_sec;
@@ -234,7 +239,7 @@ void profile_gemm(int do_verification,
     }
 
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s" << std::endl;
+              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
 }
 
 } // namespace profiler
......
@@ -4,7 +4,7 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
-#include "profile_conv.hpp"
+#include "profile_conv_fwd_impl.hpp"
 
 enum ConvDataType
 {
@@ -30,11 +30,11 @@ enum ConvOutputLayout
     NHWK, // 1
 };
 
-int conv_profiler(int argc, char* argv[])
+int profile_conv_fwd(int argc, char* argv[])
 {
     if(argc != 25)
     {
-        printf("arg1: tensor operation (conv: Convolution)\n");
+        printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
         printf("arg2: data type (0: fp32; 1: fp16)\n");
         printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
         printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
@@ -83,13 +83,13 @@ int conv_profiler(int argc, char* argv[])
     if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
        wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
     {
-        ck::profiler::profile_conv<2,
+        ck::profiler::profile_conv_fwd_impl<2,
                                             float,
                                             float,
                                             float,
                                             ck::tensor_layout::convolution::NHWC,
                                             ck::tensor_layout::convolution::KYXC,
                                             ck::tensor_layout::convolution::NHWK>(
             do_verification,
             init_method,
             do_log,
@@ -108,13 +108,13 @@ int conv_profiler(int argc, char* argv[])
     else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
             wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
     {
-        ck::profiler::profile_conv<2,
+        ck::profiler::profile_conv_fwd_impl<2,
                                             ck::half_t,
                                             ck::half_t,
                                             ck::half_t,
                                             ck::tensor_layout::convolution::NHWC,
                                             ck::tensor_layout::convolution::KYXC,
                                             ck::tensor_layout::convolution::NHWK>(
             do_verification,
             init_method,
             do_log,
......
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_conv_fwd_bias_relu_impl.hpp"
enum ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
};
enum ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
int profile_conv_fwd_bias_relu(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv_fwd_bias_relu: ForwardConvolution+Bias+ReLu)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
const int data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const int in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const int wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const int out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_bias_relu_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else
{
throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
}
return 1;
}
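
For reference, a hedged example invocation of this entry point, following the argument order documented in the printf block above (the ckProfiler binary name is an assumption; the operation string and argument layout come from the code). With Hi = Wi = 71, Y = X = 3, stride 2, dilation 1, and padding 1 on every side, the output-size formulas above give YEff = (3 - 1) * 1 + 1 = 3 and Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36, and likewise Wo = 36.

# hypothetical invocation (binary name assumed); values follow arg2..arg24 above:
#            op                 dtype in wei out ver init print rep   N   K   C  Y X Hi Wi Sy Sx Dy Dx LPy LPx RPy RPx
./ckProfiler conv_fwd_bias_relu   1   1  1   1   1   1    0     5   128 256 192 3 3 71 71  2  2  1  1  1   1   1   1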
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_conv_fwd_bias_relu_add_impl.hpp"
enum ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
};
enum ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
{
if(argc != 25)
{
printf(
"arg1: tensor operation (conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLu+Add)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
const int data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const int in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const int wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const int out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_bias_relu_add_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else
{
throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
}
return 1;
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_conv_fwd_bias_relu_atomic_add_impl.hpp"
enum ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
};
enum ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv_fwd_bias_relu_atomic_add: "
"ForwardConvolution+Bias+ReLu+AtomicAdd)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
const int data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const int in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const int wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const int out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_bias_relu_atomic_add_impl<
2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else
{
throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
}
return 1;
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_gemm_xdl.hpp"
#include "profile_gemm_impl.hpp"
enum GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
MK_KN_NM, // 4
MK_NK_NM, // 5
KM_KN_NM, // 6
KM_NK_NM, // 7
};
enum GemmDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
};
int profile_gemm(int argc, char* argv[])
{
if(argc != 14)
{
printf("arg1: tensor operation (gemm: GEMM)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, n] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, n] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
exit(1);
}
const int data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const int layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideC = std::stoi(argv[13]);
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_gemm_impl<float,
float,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_gemm_impl<float,
float,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_gemm_impl<float,
float,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_gemm_impl<float,
float,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else
{
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
return 1;
}
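
Similarly, a hedged example invocation of the GEMM path (binary name again an assumption). Layout 1 selects A[m, k] * B[n, k] = C[m, n], i.e. row-major A with column-major B, and passing a negative stride lets the profiler substitute the packed default, as the (StrideA < 0) ? K : StrideA expressions above show.

# hypothetical invocation; values follow arg2..arg13 above:
#            op   dtype layout verify init print repeat    M    N    K  StrideA StrideB StrideC
./ckProfiler gemm   1     1      1     1    0      5     3840 4096 4096    -1      -1      -1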
@@ -5,22 +5,42 @@
 #include <stdlib.h>
 #include <half.hpp>
 
-int gemm_profiler(int, char*[]);
-int conv_profiler(int, char*[]);
+int profile_gemm(int, char*[]);
+int profile_conv_fwd(int, char*[]);
+int profile_conv_fwd_bias_relu(int, char*[]);
+int profile_conv_fwd_bias_relu_add(int, char*[]);
+int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
 
 int main(int argc, char* argv[])
 {
     if(strcmp(argv[1], "gemm") == 0)
     {
-        return gemm_profiler(argc, argv);
+        return profile_gemm(argc, argv);
     }
-    else if(strcmp(argv[1], "conv") == 0)
+    else if(strcmp(argv[1], "conv_fwd") == 0)
     {
-        return conv_profiler(argc, argv);
+        return profile_conv_fwd(argc, argv);
+    }
+    else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
+    {
+        return profile_conv_fwd_bias_relu(argc, argv);
+    }
+    else if(strcmp(argv[1], "conv_fwd_bias_relu_add") == 0)
+    {
+        return profile_conv_fwd_bias_relu_add(argc, argv);
+    }
+    else if(strcmp(argv[1], "conv_fwd_bias_relu_atomic_add") == 0)
+    {
+        return profile_conv_fwd_bias_relu_atomic_add(argc, argv);
     }
     else
     {
-        printf("arg1: tensor operation (gemm=GEMM, conv=Convolution)\n");
+        printf("arg1: tensor operation (gemm: GEMM;\n"
+               "                        conv_fwd: ForwardConvolution;\n"
+               "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU;\n"
+               "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add;\n"
+               "                        conv_fwd_bias_relu_atomic_add: "
+               "ForwardConvolution+Bias+ReLU+AtomicAdd)\n");
         return 0;
     }
 }