Merge branch 'develop' into gridwise_gemm_double_buffer

7506342c · ltqin · 26b4fe97 · acbd7bd7 · 7506342c · 7506342c
Commit 7506342c authored Jan 12, 2022 by ltqin
9 changed files
--- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
+#pragma once
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_conv.hpp"
+#include "tensor_layout.hpp"
+#include "device_tensor.hpp"
+#include "device_conv_fwd_bias_activation.hpp"
+#include "element_wise_operation.hpp"
+
+// Forward declarations for the device instances of fused conv2d-forward + bias + ReLU.
+// Only the declaration is visible in this header; the definition is expected to be
+// provided by a separately compiled instance source (confirm in the build files).
+namespace ck::tensor_operation::device::device_conv2d_fwd_bias_activation_instance {
+
+// Operator pointer type with PassThrough for the input and weight element-wise ops and
+// AddRelu for the output element-wise op.
+using DeviceConvFwdBiasReluPtr = DeviceConvFwdBiasActivationPtr<
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::AddRelu>;
+
+// Registers the XDL C-shuffle, NHWC/KYXC/NHWK, fp16 instances; presumably by appending
+// them to `instances` (verify against the definition).
+void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvFwdBiasReluPtr>& instances);
+
+} // namespace ck::tensor_operation::device::device_conv2d_fwd_bias_activation_instance
+
+namespace ck {
+namespace profiler {
+
+// Host (CPU) reference for an fp16 forward convolution fused with bias add and ReLU.
+//
+// Tensors are addressed through HostTensorDescriptors built from lengths only:
+//   input  [N, Hi, Wi, C], weight [K, Y, X, C], output [N, Ho, Wo, K], bias [K].
+// (Strides are whatever HostTensorDescriptor derives from the lengths -- presumably
+// packed row-major; confirm in host_tensor.hpp.)
+//
+// Stride, Dilation and Pad are single values applied to both the H and the W dimension.
+// Filter taps that land outside [0, Hi) x [0, Wi) are skipped, i.e. contribute zero.
+// Accumulation is done in double and the result is written back through output_ptr.
+// The work is parallelised over the K output channels.
+//
+// Declared `inline`: this is a non-template function defined in a header, so without
+// `inline` every translation unit including the header would emit its own definition
+// and the program would fail to link (ODR violation).
+inline void cpu_conv_bias_relu(ck::half_t* in_ptr,
+                               ck::half_t* weight_ptr,
+                               ck::half_t* output_ptr,
+                               ck::half_t* bias_ptr,
+                               const ck::index_t N,
+                               const ck::index_t K,
+                               const ck::index_t C,
+                               const ck::index_t Y,
+                               const ck::index_t X,
+                               const ck::index_t Hi,
+                               const ck::index_t Wi,
+                               const ck::index_t Ho,
+                               const ck::index_t Wo,
+                               const ck::index_t Stride,
+                               const ck::index_t Dilation,
+                               const ck::index_t Pad)
+{
+    // The descriptors are only used to turn a multi-index into a linear offset.
+    const auto in_desc =
+        HostTensorDescriptor(std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                      static_cast<std::size_t>(Hi),
+                                                      static_cast<std::size_t>(Wi),
+                                                      static_cast<std::size_t>(C)});
+    const auto wei_desc =
+        HostTensorDescriptor(std::vector<std::size_t>{static_cast<std::size_t>(K),
+                                                      static_cast<std::size_t>(Y),
+                                                      static_cast<std::size_t>(X),
+                                                      static_cast<std::size_t>(C)});
+    const auto out_desc =
+        HostTensorDescriptor(std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                      static_cast<std::size_t>(Ho),
+                                                      static_cast<std::size_t>(Wo),
+                                                      static_cast<std::size_t>(K)});
+    const auto bias_desc =
+        HostTensorDescriptor(std::vector<std::size_t>{static_cast<std::size_t>(K)});
+
+    // Computes every output element of one output channel k.
+    auto f_k = [&](auto k) {
+        // The bias depends on k only, so read it once per channel.
+        const double bias = bias_ptr[bias_desc.GetOffsetFromMultiIndex(k)];
+
+        for(int n = 0; n < N; ++n)
+        {
+            for(int ho = 0; ho < Ho; ++ho)
+            {
+                for(int wo = 0; wo < Wo; ++wo)
+                {
+                    double v = 0;
+                    for(int c = 0; c < C; ++c)
+                    {
+                        for(int y = 0; y < Y; ++y)
+                        {
+                            const int hi = ho * Stride + y * Dilation - Pad;
+                            if(hi < 0 || hi >= Hi)
+                            {
+                                // The whole filter row lies in the padding region.
+                                continue;
+                            }
+
+                            for(int x = 0; x < X; ++x)
+                            {
+                                const int wi = wo * Stride + x * Dilation - Pad;
+                                if(wi < 0 || wi >= Wi)
+                                {
+                                    continue;
+                                }
+
+                                const double in =
+                                    in_ptr[in_desc.GetOffsetFromMultiIndex(n, hi, wi, c)];
+                                const double wei =
+                                    weight_ptr[wei_desc.GetOffsetFromMultiIndex(k, y, x, c)];
+
+                                v += in * wei;
+                            }
+                        }
+                    }
+
+                    v += bias;
+
+                    // ReLU
+                    v = v > 0 ? v : 0;
+
+                    output_ptr[out_desc.GetOffsetFromMultiIndex(n, ho, wo, k)] = v;
+                }
+            }
+        }
+    };
+
+    // std::thread::hardware_concurrency() is allowed to return 0 when the value cannot be
+    // determined; fall back to a single worker instead of forwarding a thread count of 0.
+    const auto hw_threads = std::thread::hardware_concurrency();
+
+    make_ParallelTensorFunctor(f_k, K)(hw_threads > 0 ? hw_threads : 1);
+}
+
+// Profiles every registered device instance of fused conv2d-forward + bias + ReLU on one
+// problem size and prints per-instance and best timing / throughput to std::cout.
+//
+// Template parameters:
+//   NDimSpatial  - not referenced in the body; the code indexes exactly two spatial
+//                  dimensions (element 0 = H, element 1 = W of each spatial vector).
+//   InDataType, WeiDataType, OutDataType
+//                - element types of input, weight and output; the bias tensor uses
+//                  OutDataType. Device instances are only registered when all three are
+//                  ck::half_t.
+//   InLayout, WeiLayout, OutLayout
+//                - layout tags that select the strides of the host tensors.
+//
+// Parameters:
+//   do_verification - non-zero: compute a host reference with cpu_conv_bias_relu and run
+//                     check_error against the output of each supported instance.
+//   init_method     - 0: no generator is run on the host tensors; 1: GeneratorTensor_2
+//                     with {-5, 5}; any other value: GeneratorTensor_3 (see the switch).
+//   do_log          - when verification is enabled, also dump the tensors to std::cout.
+//   nrepeat         - forwarded to the invoker's Run().
+//   N, K, C, *_spatial_lengths, conv_filter_*, input_*_pads
+//                   - problem description, forwarded unchanged to MakeArgumentPointer().
+//
+// NOTE: the host reference only receives conv_filter_strides[0],
+// conv_filter_dilations[0] and input_left_pads[0], so verification is only meaningful
+// when H and W share the same stride / dilation / padding.
+//
+// Throws std::runtime_error when no device instance has been registered.
+template <int NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+void profile_conv_fwd_bias_relu_impl(int do_verification,
+                                     int init_method,
+                                     bool do_log,
+                                     int nrepeat,
+                                     ck::index_t N,
+                                     ck::index_t K,
+                                     ck::index_t C,
+                                     std::vector<ck::index_t> input_spatial_lengths,
+                                     std::vector<ck::index_t> filter_spatial_lengths,
+                                     std::vector<ck::index_t> output_spatial_lengths,
+                                     std::vector<ck::index_t> conv_filter_strides,
+                                     std::vector<ck::index_t> conv_filter_dilations,
+                                     std::vector<ck::index_t> input_left_pads,
+                                     std::vector<ck::index_t> input_right_pads)
+{
+    const ck::index_t Y = filter_spatial_lengths[0];
+    const ck::index_t X = filter_spatial_lengths[1];
+
+    const ck::index_t Hi = input_spatial_lengths[0];
+    const ck::index_t Wi = input_spatial_lengths[1];
+
+    const ck::index_t Ho = output_spatial_lengths[0];
+    const ck::index_t Wo = output_spatial_lengths[1];
+
+    // Lengths are always given in (N, C, H, W) order; only the strides depend on the
+    // layout tag, so the same multi-index addresses either layout.
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
+            if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
+                         is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
+                         is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+            }
+            else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
+                              is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
+                              is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
+            {
+                // Channel is the fastest-moving dimension.
+                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+            }
+        };
+
+    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
+    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
+    Tensor<OutDataType> out_n_k_ho_wo_host_result(
+        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
+    Tensor<OutDataType> out_n_k_ho_wo_device_result(
+        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
+
+    // bias: assume contiguous 1d vector
+    Tensor<OutDataType> bias_k(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
+    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
+    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
+    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        break;
+    default:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+    }
+
+    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
+
+    if(do_verification)
+    {
+        // The host reference is computed once and compared against every instance.
+        cpu_conv_bias_relu(in_n_c_hi_wi.mData.data(),
+                           wei_k_c_y_x.mData.data(),
+                           out_n_k_ho_wo_host_result.mData.data(),
+                           bias_k.mData.data(),
+                           N,
+                           K,
+                           C,
+                           Y,
+                           X,
+                           Hi,
+                           Wi,
+                           Ho,
+                           Wo,
+                           conv_filter_strides[0],
+                           conv_filter_dilations[0],
+                           input_left_pads[0]);
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
+    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
+
+    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    bias_device_buf.ToDevice(bias_k.mData.data());
+
+    using DeviceConvFwdBiasReluPtr = ck::tensor_operation::device::
+        DeviceConvFwdBiasActivationPtr<InElementOp, WeiElementOp, OutElementOp>;
+
+    // add device operator instances
+    std::vector<DeviceConvFwdBiasReluPtr> op_ptrs;
+
+    if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
+                 ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
+                 ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
+    {
+        ck::tensor_operation::device::device_conv2d_fwd_bias_activation_instance::
+            add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
+    }
+
+    if(op_ptrs.empty())
+    {
+        throw std::runtime_error("wrong! no device Conv instance found");
+    }
+
+    // Work and traffic depend on the problem size only, so compute them once.
+    // Every product is widened to std::size_t *before* multiplying: evaluating
+    // N * C * Hi * Wi in ck::index_t (presumably a 32-bit integer -- confirm in
+    // config.hpp) can overflow for large problems.
+    const auto to_size = [](ck::index_t v) { return static_cast<std::size_t>(v); };
+
+    const std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+
+    const std::size_t num_btype = sizeof(InDataType) * (to_size(N) * C * Hi * Wi) +
+                                  sizeof(WeiDataType) * (to_size(K) * C * Y * X) +
+                                  sizeof(OutDataType) * (to_size(N) * K * Ho * Wo) +
+                                  sizeof(OutDataType) * to_size(K);
+
+    std::string best_conv_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device Conv instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
+            N,
+            K,
+            C,
+            input_spatial_lengths,
+            filter_spatial_lengths,
+            output_spatial_lengths,
+            conv_filter_strides,
+            conv_filter_dilations,
+            input_left_pads,
+            input_right_pads,
+            InElementOp{},
+            WeiElementOp{},
+            OutElementOp{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        // Instances that cannot handle this problem are silently skipped.
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::string conv_name = op_ptr->GetTypeString();
+
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+
+            // ave_time is printed as milliseconds below, hence 1e9 (not 1e12) for TFlops
+            // and 1e6 (not 1e9) for GB/s.
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                      << " GB/s, " << conv_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_conv_name  = conv_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
+
+                check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "out_host  : ", out_n_k_ho_wo_host_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
+}
+
+} // namespace profiler
+} // namespace ck
--- a/profiler/include/profile_conv.hpp
+++ b/profiler/include/profile_conv.hpp
@@ -6,40 +6,26 @@
 #include "host_conv.hpp"
 #include "tensor_layout.hpp"
 #include "device_tensor.hpp"
-#include "device_conv.hpp"
-#include "device_conv_instance.hpp"
+#include "device_conv_fwd.hpp"
 #include "element_wise_operation.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_conv_instance {
+namespace device_conv2d_fwd_instance {

 using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
                                              ck::tensor_operation::element_wise::PassThrough,
                                              ck::tensor_operation::element_wise::PassThrough>;

-template <>
-void add_device_conv_fwd_instance<2,
-                                  float,
-                                  float,
-                                  float,
-                                  ck::tensor_layout::convolution::NHWC,
-                                  ck::tensor_layout::convolution::KYXC,
-                                  ck::tensor_layout::convolution::NHWK>(
-    std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);

-template <>
-void add_device_conv_fwd_instance<2,
-                                  ck::half_t,
-                                  ck::half_t,
-                                  ck::half_t,
-                                  ck::tensor_layout::convolution::NHWC,
-                                  ck::tensor_layout::convolution::KYXC,
-                                  ck::tensor_layout::convolution::NHWK>(
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
    std::vector<DeviceConvFwdNoOpPtr>&);

-} // namespace device_conv_instance
+} // namespace device_conv2d_fwd_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
@@ -54,20 +40,20 @@ template <int NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout>
-void profile_conv(int do_verification,
-                  int init_method,
-                  bool do_log,
-                  int nrepeat,
-                  ck::index_t N,
-                  ck::index_t K,
-                  ck::index_t C,
-                  std::vector<ck::index_t> input_spatial_lengths,
-                  std::vector<ck::index_t> filter_spatial_lengths,
-                  std::vector<ck::index_t> output_spatial_lengths,
-                  std::vector<ck::index_t> conv_filter_strides,
-                  std::vector<ck::index_t> conv_filter_dilations,
-                  std::vector<ck::index_t> input_left_pads,
-                  std::vector<ck::index_t> input_right_pads)
+void profile_conv_fwd_impl(int do_verification,
+                           int init_method,
+                           bool do_log,
+                           int nrepeat,
+                           ck::index_t N,
+                           ck::index_t K,
+                           ck::index_t C,
+                           std::vector<ck::index_t> input_spatial_lengths,
+                           std::vector<ck::index_t> filter_spatial_lengths,
+                           std::vector<ck::index_t> output_spatial_lengths,
+                           std::vector<ck::index_t> conv_filter_strides,
+                           std::vector<ck::index_t> conv_filter_dilations,
+                           std::vector<ck::index_t> input_left_pads,
+                           std::vector<ck::index_t> input_right_pads)
 {
    const ck::index_t Y = filter_spatial_lengths[0];
    const ck::index_t X = filter_spatial_lengths[1];
@@ -146,20 +132,30 @@ void profile_conv(int do_verification,
    // add device Conv instances
    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;

-    ck::tensor_operation::device::device_conv_instance::add_device_conv_fwd_instance<2,
-                                                                                     InDataType,
-                                                                                     WeiDataType,
-                                                                                     OutDataType,
-                                                                                     InLayout,
-                                                                                     WeiLayout,
-                                                                                     OutLayout>(
-        conv_ptrs);
+    if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
+                 ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
+                 ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
+    {
+        ck::tensor_operation::device::device_conv2d_fwd_instance::
+            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
+    }
+    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
+                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
+                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
+    {
+        ck::tensor_operation::device::device_conv2d_fwd_instance::
+            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+
+        ck::tensor_operation::device::device_conv2d_fwd_instance::
+            add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+    }

    if(conv_ptrs.size() <= 0)
    {
        throw std::runtime_error("wrong! no device Conv instance found");
    }

+    std::string best_conv_name;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
@@ -189,6 +185,8 @@ void profile_conv(int do_verification,

        if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            std::string conv_name = conv_ptr->GetTypeString();
+
            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

            std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
@@ -202,10 +200,11 @@ void profile_conv(int do_verification,
            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                      << " GB/s" << std::endl;
+                      << " GB/s, " << conv_name << std::endl;

            if(tflops > best_tflops)
            {
+                best_conv_name  = conv_name;
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
@@ -235,7 +234,7 @@ void profile_conv(int do_verification,
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s" << std::endl;
+              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
 }

 } // namespace profiler

--- a/profiler/include/profile_gemm.hpp
+++ b/profiler/include/profile_gemm.hpp
@@ -88,16 +88,16 @@ template <typename ADataType,
          typename ALayout,
          typename BLayout,
          typename CLayout>
-void profile_gemm(int do_verification,
-                  int init_method,
-                  bool do_log,
-                  int nrepeat,
-                  int M,
-                  int N,
-                  int K,
-                  int StrideA,
-                  int StrideB,
-                  int StrideC)
+void profile_gemm_impl(int do_verification,
+                       int init_method,
+                       bool do_log,
+                       int nrepeat,
+                       int M,
+                       int N,
+                       int K,
+                       int StrideA,
+                       int StrideB,
+                       int StrideC)
 {
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -164,6 +164,7 @@ void profile_gemm(int do_verification,
        throw std::runtime_error("wrong! no device GEMM instance found");
    }

+    std::string best_gemm_name;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
@@ -189,9 +190,12 @@ void profile_gemm(int do_verification,

        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            std::string gemm_name = gemm_ptr->GetTypeString();
+
            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

            std::size_t flop = std::size_t(2) * M * N * K;
+
            std::size_t num_btype =
                sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + sizeof(CDataType) * M * N;

@@ -200,10 +204,11 @@ void profile_gemm(int do_verification,
            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                      << " GB/s" << std::endl;
+                      << " GB/s, " << gemm_name << std::endl;

            if(tflops > best_tflops)
            {
+                best_gemm_name  = gemm_name;
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
@@ -234,7 +239,7 @@ void profile_gemm(int do_verification,
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s" << std::endl;
+              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
 }

 } // namespace profiler

--- a/profiler/conv_profiler.cpp
+++ b/profiler/conv_profiler.cpp
@@ -4,7 +4,7 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
-#include "profile_conv.hpp"
+#include "profile_conv_fwd_impl.hpp"

 enum ConvDataType
 {
@@ -30,11 +30,11 @@ enum ConvOutputLayout
    NHWK, // 1
 };

-int conv_profiler(int argc, char* argv[])
+int profile_conv_fwd(int argc, char* argv[])
 {
    if(argc != 25)
    {
-        printf("arg1: tensor operation (conv: Convolution)\n");
+        printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
@@ -83,13 +83,13 @@ int conv_profiler(int argc, char* argv[])
    if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
       wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
    {
-        ck::profiler::profile_conv<2,
-                                   float,
-                                   float,
-                                   float,
-                                   ck::tensor_layout::convolution::NHWC,
-                                   ck::tensor_layout::convolution::KYXC,
-                                   ck::tensor_layout::convolution::NHWK>(
+        ck::profiler::profile_conv_fwd_impl<2,
+                                            float,
+                                            float,
+                                            float,
+                                            ck::tensor_layout::convolution::NHWC,
+                                            ck::tensor_layout::convolution::KYXC,
+                                            ck::tensor_layout::convolution::NHWK>(
            do_verification,
            init_method,
            do_log,
@@ -108,13 +108,13 @@ int conv_profiler(int argc, char* argv[])
    else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
            wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
    {
-        ck::profiler::profile_conv<2,
-                                   ck::half_t,
-                                   ck::half_t,
-                                   ck::half_t,
-                                   ck::tensor_layout::convolution::NHWC,
-                                   ck::tensor_layout::convolution::KYXC,
-                                   ck::tensor_layout::convolution::NHWK>(
+        ck::profiler::profile_conv_fwd_impl<2,
+                                            ck::half_t,
+                                            ck::half_t,
+                                            ck::half_t,
+                                            ck::tensor_layout::convolution::NHWC,
+                                            ck::tensor_layout::convolution::KYXC,
+                                            ck::tensor_layout::convolution::NHWK>(
            do_verification,
            init_method,
            do_log,

--- a/profiler/profile_conv_fwd_bias_relu.cpp
+++ b/profiler/profile_conv_fwd_bias_relu.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "profile_conv_fwd_bias_relu_impl.hpp"
+
+// Data types selectable through arg2; the values are the integers listed in the usage text
+// printed by profile_conv_fwd_bias_relu (0: fp32; 1: fp16).
+enum ConvDataType
+{
+    F32_F32_F32 = 0,
+    F16_F16_F16 = 1,
+};
+
+// Input tensor layouts selectable through arg3 (0: NCHW; 1: NHWC).
+enum ConvInputLayout
+{
+    NCHW = 0,
+    NHWC = 1,
+};
+
+// Weight tensor layouts selectable through arg4 (0: KCYX; 1: KYXC).
+enum ConvWeightLayout
+{
+    KCYX = 0,
+    KYXC = 1,
+};
+
+// Output tensor layouts selectable through arg5 (0: NKHW; 1: NHWK).
+enum ConvOutputLayout
+{
+    NKHW = 0,
+    NHWK = 1,
+};
+
+// Command-line entry for the "conv_fwd_bias_relu" profiler operation.
+//
+// argv must hold exactly 25 entries (see the usage text below); any other count prints the
+// argument reference and ends the process via exit(1). The parsed problem description is handed
+// to ck::profiler::profile_conv_fwd_bias_relu_impl.
+//
+// Throws std::runtime_error when a stride is not positive, or when the requested data type and
+// layouts are anything other than fp16 with NHWC input, KYXC weight and NHWK output.
+// Non-numeric arguments surface as the exceptions thrown by std::stoi.
+//
+// Returns 1 once the profiling call has returned.
+// NOTE(review): main() in profiler.cpp returns this value as the process exit status, so a
+// successful run exits with 1 - confirm whether that is intended.
+int profile_conv_fwd_bias_relu(int argc, char* argv[])
+{
+    if(argc != 25)
+    {
+        printf("arg1: tensor operation (conv_fwd_bias_relu: ForwardConvolution+Bias+ReLu)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
+        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
+        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
+        printf("arg6: verification (0: no; 1: yes)\n");
+        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg8: print tensor value (0: no; 1: yes)\n");
+        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(1);
+    }
+
+    const int data_type        = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const int in_layout        = static_cast<ConvInputLayout>(std::stoi(argv[3]));
+    const int wei_layout       = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
+    const int out_layout       = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
+    const bool do_verification = std::stoi(argv[6]);
+    const int init_method      = std::stoi(argv[7]);
+    const bool do_log          = std::stoi(argv[8]);
+    const int nrepeat          = std::stoi(argv[9]);
+
+    const ck::index_t N  = std::stoi(argv[10]);
+    const ck::index_t K  = std::stoi(argv[11]);
+    const ck::index_t C  = std::stoi(argv[12]);
+    const ck::index_t Y  = std::stoi(argv[13]);
+    const ck::index_t X  = std::stoi(argv[14]);
+    const ck::index_t Hi = std::stoi(argv[15]);
+    const ck::index_t Wi = std::stoi(argv[16]);
+
+    const ck::index_t conv_stride_h   = std::stoi(argv[17]);
+    const ck::index_t conv_stride_w   = std::stoi(argv[18]);
+    const ck::index_t conv_dilation_h = std::stoi(argv[19]);
+    const ck::index_t conv_dilation_w = std::stoi(argv[20]);
+    const ck::index_t in_left_pad_h   = std::stoi(argv[21]);
+    const ck::index_t in_left_pad_w   = std::stoi(argv[22]);
+    const ck::index_t in_right_pad_h  = std::stoi(argv[23]);
+    const ck::index_t in_right_pad_w  = std::stoi(argv[24]);
+
+    // The output-size formula below divides by the strides; reject values that would divide by
+    // zero instead of crashing.
+    if(conv_stride_h <= 0 || conv_stride_w <= 0)
+    {
+        throw std::runtime_error("wrong! convolution stride must be positive");
+    }
+
+    // Filter extent once dilation is applied.
+    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+    // Only one data type / layout combination is dispatched for this operation.
+    const bool is_supported =
+        data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
+        wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK;
+
+    if(!is_supported)
+    {
+        throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
+    }
+
+    ck::profiler::profile_conv_fwd_bias_relu_impl<2,
+                                                  ck::half_t,
+                                                  ck::half_t,
+                                                  ck::half_t,
+                                                  ck::tensor_layout::convolution::NHWC,
+                                                  ck::tensor_layout::convolution::KYXC,
+                                                  ck::tensor_layout::convolution::NHWK>(
+        do_verification,
+        init_method,
+        do_log,
+        nrepeat,
+        N,
+        K,
+        C,
+        std::vector<ck::index_t>{Hi, Wi},
+        std::vector<ck::index_t>{Y, X},
+        std::vector<ck::index_t>{Ho, Wo},
+        std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
+        std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
+        std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
+        std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
+
+    return 1;
+}
--- a/profiler/profile_conv_fwd_bias_relu_add.cpp
+++ b/profiler/profile_conv_fwd_bias_relu_add.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "profile_conv_fwd_bias_relu_add_impl.hpp"
+
+// Data types selectable through arg2; the values are the integers listed in the usage text
+// printed by profile_conv_fwd_bias_relu_add (0: fp32; 1: fp16).
+enum ConvDataType
+{
+    F32_F32_F32 = 0,
+    F16_F16_F16 = 1,
+};
+
+// Input tensor layouts selectable through arg3 (0: NCHW; 1: NHWC).
+enum ConvInputLayout
+{
+    NCHW = 0,
+    NHWC = 1,
+};
+
+// Weight tensor layouts selectable through arg4 (0: KCYX; 1: KYXC).
+enum ConvWeightLayout
+{
+    KCYX = 0,
+    KYXC = 1,
+};
+
+// Output tensor layouts selectable through arg5 (0: NKHW; 1: NHWK).
+enum ConvOutputLayout
+{
+    NKHW = 0,
+    NHWK = 1,
+};
+
+// Command-line entry for the "conv_fwd_bias_relu_add" profiler operation.
+//
+// argv must hold exactly 25 entries (see the usage text below); any other count prints the
+// argument reference and ends the process via exit(1). The parsed problem description is handed
+// to ck::profiler::profile_conv_fwd_bias_relu_add_impl.
+//
+// Throws std::runtime_error when a stride is not positive, or when the requested data type and
+// layouts are anything other than fp16 with NHWC input, KYXC weight and NHWK output.
+// Non-numeric arguments surface as the exceptions thrown by std::stoi.
+//
+// Returns 1 once the profiling call has returned.
+// NOTE(review): main() in profiler.cpp returns this value as the process exit status, so a
+// successful run exits with 1 - confirm whether that is intended.
+int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
+{
+    if(argc != 25)
+    {
+        printf(
+            "arg1: tensor operation (conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLu+Add)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
+        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
+        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
+        printf("arg6: verification (0: no; 1: yes)\n");
+        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg8: print tensor value (0: no; 1: yes)\n");
+        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(1);
+    }
+
+    const int data_type        = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const int in_layout        = static_cast<ConvInputLayout>(std::stoi(argv[3]));
+    const int wei_layout       = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
+    const int out_layout       = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
+    const bool do_verification = std::stoi(argv[6]);
+    const int init_method      = std::stoi(argv[7]);
+    const bool do_log          = std::stoi(argv[8]);
+    const int nrepeat          = std::stoi(argv[9]);
+
+    const ck::index_t N  = std::stoi(argv[10]);
+    const ck::index_t K  = std::stoi(argv[11]);
+    const ck::index_t C  = std::stoi(argv[12]);
+    const ck::index_t Y  = std::stoi(argv[13]);
+    const ck::index_t X  = std::stoi(argv[14]);
+    const ck::index_t Hi = std::stoi(argv[15]);
+    const ck::index_t Wi = std::stoi(argv[16]);
+
+    const ck::index_t conv_stride_h   = std::stoi(argv[17]);
+    const ck::index_t conv_stride_w   = std::stoi(argv[18]);
+    const ck::index_t conv_dilation_h = std::stoi(argv[19]);
+    const ck::index_t conv_dilation_w = std::stoi(argv[20]);
+    const ck::index_t in_left_pad_h   = std::stoi(argv[21]);
+    const ck::index_t in_left_pad_w   = std::stoi(argv[22]);
+    const ck::index_t in_right_pad_h  = std::stoi(argv[23]);
+    const ck::index_t in_right_pad_w  = std::stoi(argv[24]);
+
+    // The output-size formula below divides by the strides; reject values that would divide by
+    // zero instead of crashing.
+    if(conv_stride_h <= 0 || conv_stride_w <= 0)
+    {
+        throw std::runtime_error("wrong! convolution stride must be positive");
+    }
+
+    // Filter extent once dilation is applied.
+    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+    // Only one data type / layout combination is dispatched for this operation.
+    const bool is_supported =
+        data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
+        wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK;
+
+    if(!is_supported)
+    {
+        throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
+    }
+
+    ck::profiler::profile_conv_fwd_bias_relu_add_impl<2,
+                                                      ck::half_t,
+                                                      ck::half_t,
+                                                      ck::half_t,
+                                                      ck::tensor_layout::convolution::NHWC,
+                                                      ck::tensor_layout::convolution::KYXC,
+                                                      ck::tensor_layout::convolution::NHWK>(
+        do_verification,
+        init_method,
+        do_log,
+        nrepeat,
+        N,
+        K,
+        C,
+        std::vector<ck::index_t>{Hi, Wi},
+        std::vector<ck::index_t>{Y, X},
+        std::vector<ck::index_t>{Ho, Wo},
+        std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
+        std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
+        std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
+        std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
+
+    return 1;
+}
--- a/profiler/profile_conv_fwd_bias_relu_atomic_add.cpp
+++ b/profiler/profile_conv_fwd_bias_relu_atomic_add.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "profile_conv_fwd_bias_relu_atomic_add_impl.hpp"
+
+// Data types selectable through arg2; the values are the integers listed in the usage text
+// printed by profile_conv_fwd_bias_relu_atomic_add (0: fp32; 1: fp16).
+enum ConvDataType
+{
+    F32_F32_F32 = 0,
+    F16_F16_F16 = 1,
+};
+
+// Input tensor layouts selectable through arg3 (0: NCHW; 1: NHWC).
+enum ConvInputLayout
+{
+    NCHW = 0,
+    NHWC = 1,
+};
+
+// Weight tensor layouts selectable through arg4 (0: KCYX; 1: KYXC).
+enum ConvWeightLayout
+{
+    KCYX = 0,
+    KYXC = 1,
+};
+
+// Output tensor layouts selectable through arg5 (0: NKHW; 1: NHWK).
+enum ConvOutputLayout
+{
+    NKHW = 0,
+    NHWK = 1,
+};
+
+// Command-line entry for the "conv_fwd_bias_relu_atomic_add" profiler operation.
+//
+// argv must hold exactly 25 entries (see the usage text below); any other count prints the
+// argument reference and ends the process via exit(1). The parsed problem description is handed
+// to ck::profiler::profile_conv_fwd_bias_relu_atomic_add_impl.
+//
+// Throws std::runtime_error when a stride is not positive, or when the requested data type and
+// layouts are anything other than fp16 with NHWC input, KYXC weight and NHWK output.
+// Non-numeric arguments surface as the exceptions thrown by std::stoi.
+//
+// Returns 1 once the profiling call has returned.
+// NOTE(review): main() in profiler.cpp returns this value as the process exit status, so a
+// successful run exits with 1 - confirm whether that is intended.
+int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
+{
+    if(argc != 25)
+    {
+        printf("arg1: tensor operation (conv_fwd_bias_relu_atomic_add: "
+               "ForwardConvolution+Bias+ReLu+AtomicAdd)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
+        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
+        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
+        printf("arg6: verification (0: no; 1: yes)\n");
+        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg8: print tensor value (0: no; 1: yes)\n");
+        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(1);
+    }
+
+    const int data_type        = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const int in_layout        = static_cast<ConvInputLayout>(std::stoi(argv[3]));
+    const int wei_layout       = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
+    const int out_layout       = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
+    const bool do_verification = std::stoi(argv[6]);
+    const int init_method      = std::stoi(argv[7]);
+    const bool do_log          = std::stoi(argv[8]);
+    const int nrepeat          = std::stoi(argv[9]);
+
+    const ck::index_t N  = std::stoi(argv[10]);
+    const ck::index_t K  = std::stoi(argv[11]);
+    const ck::index_t C  = std::stoi(argv[12]);
+    const ck::index_t Y  = std::stoi(argv[13]);
+    const ck::index_t X  = std::stoi(argv[14]);
+    const ck::index_t Hi = std::stoi(argv[15]);
+    const ck::index_t Wi = std::stoi(argv[16]);
+
+    const ck::index_t conv_stride_h   = std::stoi(argv[17]);
+    const ck::index_t conv_stride_w   = std::stoi(argv[18]);
+    const ck::index_t conv_dilation_h = std::stoi(argv[19]);
+    const ck::index_t conv_dilation_w = std::stoi(argv[20]);
+    const ck::index_t in_left_pad_h   = std::stoi(argv[21]);
+    const ck::index_t in_left_pad_w   = std::stoi(argv[22]);
+    const ck::index_t in_right_pad_h  = std::stoi(argv[23]);
+    const ck::index_t in_right_pad_w  = std::stoi(argv[24]);
+
+    // The output-size formula below divides by the strides; reject values that would divide by
+    // zero instead of crashing.
+    if(conv_stride_h <= 0 || conv_stride_w <= 0)
+    {
+        throw std::runtime_error("wrong! convolution stride must be positive");
+    }
+
+    // Filter extent once dilation is applied.
+    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+    // Only one data type / layout combination is dispatched for this operation.
+    const bool is_supported =
+        data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
+        wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK;
+
+    if(!is_supported)
+    {
+        throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
+    }
+
+    ck::profiler::profile_conv_fwd_bias_relu_atomic_add_impl<
+        2,
+        ck::half_t,
+        ck::half_t,
+        ck::half_t,
+        ck::tensor_layout::convolution::NHWC,
+        ck::tensor_layout::convolution::KYXC,
+        ck::tensor_layout::convolution::NHWK>(
+        do_verification,
+        init_method,
+        do_log,
+        nrepeat,
+        N,
+        K,
+        C,
+        std::vector<ck::index_t>{Hi, Wi},
+        std::vector<ck::index_t>{Y, X},
+        std::vector<ck::index_t>{Ho, Wo},
+        std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
+        std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
+        std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
+        std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
+
+    return 1;
+}
--- a/profiler/profile_gemm.cpp
+++ b/profiler/profile_gemm.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_gemm.hpp"
+#include "device_tensor.hpp"
+#include "device_base.hpp"
+#include "device_gemm_xdl.hpp"
+#include "profile_gemm_impl.hpp"
+
+// Operand layout combinations selectable through arg3. Each name gives the index order of A, B
+// and C in turn (MK_KN_MN is A[m, k] * B[k, n] = C[m, n]). profile_gemm dispatches only the
+// four *_MN entries (values 0 to 3).
+enum GemmMatrixLayout
+{
+    MK_KN_MN = 0,
+    MK_NK_MN = 1,
+    KM_KN_MN = 2,
+    KM_NK_MN = 3,
+    MK_KN_NM = 4,
+    MK_NK_NM = 5,
+    KM_KN_NM = 6,
+    KM_NK_NM = 7,
+};
+
+// Data types selectable through arg2 (0: fp32; 1: fp16).
+enum GemmDataType
+{
+    F32_F32_F32 = 0,
+    F16_F16_F16 = 1,
+};
+
+// Command-line entry for the "gemm" profiler operation.
+//
+// argv must hold exactly 14 entries (see the usage text below); any other count prints the
+// argument reference and ends the process via exit(1). The data type (arg2) and matrix layout
+// (arg3) select which ck::profiler::profile_gemm_impl instantiation is called.
+//
+// A negative StrideA/StrideB/StrideC argument is replaced by a default taken from the problem
+// size: K or M for A, N or K for B (depending on the operand's layout) and N for C.
+//
+// Throws std::runtime_error for a data type / layout pair that has no branch below.
+// Non-numeric arguments surface as the exceptions thrown by std::stoi.
+//
+// Returns 1 once the profiling call has returned.
+// NOTE(review): main() in profiler.cpp returns this value as the process exit status, so a
+// successful run exits with 1 - confirm whether that is intended.
+int profile_gemm(int argc, char* argv[])
+{
+    if(argc != 14)
+    {
+        printf("arg1: tensor operation (gemm: GEMM)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
+        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
+        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
+        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
+        exit(1);
+    }
+
+    const int data_type        = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const int layout           = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const int nrepeat          = std::stoi(argv[7]);
+
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    const int StrideA = std::stoi(argv[11]);
+    const int StrideB = std::stoi(argv[12]);
+    const int StrideC = std::stoi(argv[13]);
+
+    // fp16 instantiations, one per supported layout.
+    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        ck::profiler::profile_gemm_impl<ck::half_t,
+                                        ck::half_t,
+                                        ck::half_t,
+                                        ck::tensor_layout::gemm::RowMajor,
+                                        ck::tensor_layout::gemm::RowMajor,
+                                        ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? K : StrideA,
+            (StrideB < 0) ? N : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        ck::profiler::profile_gemm_impl<ck::half_t,
+                                        ck::half_t,
+                                        ck::half_t,
+                                        ck::tensor_layout::gemm::RowMajor,
+                                        ck::tensor_layout::gemm::ColumnMajor,
+                                        ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? K : StrideA,
+            (StrideB < 0) ? K : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        ck::profiler::profile_gemm_impl<ck::half_t,
+                                        ck::half_t,
+                                        ck::half_t,
+                                        ck::tensor_layout::gemm::ColumnMajor,
+                                        ck::tensor_layout::gemm::RowMajor,
+                                        ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? M : StrideA,
+            (StrideB < 0) ? N : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        ck::profiler::profile_gemm_impl<ck::half_t,
+                                        ck::half_t,
+                                        ck::half_t,
+                                        ck::tensor_layout::gemm::ColumnMajor,
+                                        ck::tensor_layout::gemm::ColumnMajor,
+                                        ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? M : StrideA,
+            (StrideB < 0) ? K : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    // fp32 instantiations, mirroring the fp16 branches above.
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        ck::profiler::profile_gemm_impl<float,
+                                        float,
+                                        float,
+                                        ck::tensor_layout::gemm::RowMajor,
+                                        ck::tensor_layout::gemm::RowMajor,
+                                        ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? K : StrideA,
+            (StrideB < 0) ? N : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        ck::profiler::profile_gemm_impl<float,
+                                        float,
+                                        float,
+                                        ck::tensor_layout::gemm::RowMajor,
+                                        ck::tensor_layout::gemm::ColumnMajor,
+                                        ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? K : StrideA,
+            (StrideB < 0) ? K : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        ck::profiler::profile_gemm_impl<float,
+                                        float,
+                                        float,
+                                        ck::tensor_layout::gemm::ColumnMajor,
+                                        ck::tensor_layout::gemm::RowMajor,
+                                        ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? M : StrideA,
+            (StrideB < 0) ? N : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        ck::profiler::profile_gemm_impl<float,
+                                        float,
+                                        float,
+                                        ck::tensor_layout::gemm::ColumnMajor,
+                                        ck::tensor_layout::gemm::ColumnMajor,
+                                        ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? M : StrideA,
+            (StrideB < 0) ? K : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else
+    {
+        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
+    }
+
+    return 1;
+}
--- a/profiler/profiler.cpp
+++ b/profiler/profiler.cpp
@@ -5,22 +5,48 @@
 #include <stdlib.h>
 #include <half.hpp>

-int gemm_profiler(int, char*[]);
-int conv_profiler(int, char*[]);
+int profile_gemm(int, char*[]);
+int profile_conv_fwd(int, char*[]);
+int profile_conv_fwd_bias_relu(int, char*[]);
+int profile_conv_fwd_bias_relu_add(int, char*[]);
+int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);

+// Dispatches to the profiler selected by the first command-line argument and returns that
+// profiler's result. When the operation is missing or not recognised, the list of supported
+// operations is printed and 0 is returned.
 int main(int argc, char* argv[])
 {
-    if(strcmp(argv[1], "gemm") == 0)
+    // With no arguments argv[1] is a null pointer, which must not be handed to strcmp; an empty
+    // name matches no operation and falls through to the usage text.
+    const char* op = (argc > 1) ? argv[1] : "";
+    if(strcmp(op, "gemm") == 0)
    {
-        return gemm_profiler(argc, argv);
+        return profile_gemm(argc, argv);
    }
-    else if(strcmp(argv[1], "conv") == 0)
+    else if(strcmp(op, "conv_fwd") == 0)
    {
-        return conv_profiler(argc, argv);
+        return profile_conv_fwd(argc, argv);
+    }
+    else if(strcmp(op, "conv_fwd_bias_relu") == 0)
+    {
+        return profile_conv_fwd_bias_relu(argc, argv);
+    }
+    else if(strcmp(op, "conv_fwd_bias_relu_add") == 0)
+    {
+        return profile_conv_fwd_bias_relu_add(argc, argv);
+    }
+    else if(strcmp(op, "conv_fwd_bias_relu_atomic_add") == 0)
+    {
+        return profile_conv_fwd_bias_relu_atomic_add(argc, argv);
    }
    else
    {
-        printf("arg1: tensor operation (gemm=GEMM, conv=Convolution)\n");
+        printf("arg1: tensor operation (gemm: GEMM;\n"
+               "                        conv_fwd: ForwardConvolution;\n"
+               "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU;\n"
+               "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add;\n"
+               "                        conv_fwd_bias_relu_atomic_add: "
+               "ForwardConvolution+Bias+ReLU+AtomicAdd)\n");
        return 0;
    }
 }