"...composable_kernel.git" did not exist on "42facfc6b7433595e052c6c0b53138dae84530d7"
Commit 96d1a7a9 authored by Chao Liu

added profiler for conv+bias+relu+add

parent cd929111
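
Both the example and the new profiler fuse the convolution epilogue into a single output elementwise operator, ck::tensor_operation::element_wise::AddReluAdd, replacing the ad-hoc functors the example previously defined locally. A minimal sketch of the math that operator is expected to apply, based on the BiasReluAdd host path removed below (illustrative only, not the library's implementation; v0 is the convolution accumulator, v1 the per-channel bias, v2 the residual):

    // Sketch of the fused epilogue: out = relu(acc + bias) + residual.
    struct AddReluAddSketch
    {
        template <typename T1, typename T2>
        __host__ __device__ constexpr float operator()(float v0, T1 v1, T2 v2) const
        {
            float a = v0 + v1;       // add per-channel bias
            float b = a > 0 ? a : 0; // ReLU
            return b + v2;           // add residual
        }
    };
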
@@ -12,17 +12,9 @@
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
 #include "device_conv2d_fwd_xdl_bias_activation_add_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"

-struct PassThrough
-{
-    template <typename T>
-    __host__ __device__ constexpr T operator()(T v) const
-    {
-        return v;
-    }
-};
-
-struct BiasLeakyReluAdd
+struct AddLeakyReluAdd
 {
     template <typename T1, typename T2>
     __host__ constexpr float operator()(float v0, T1 v1, T2 v2) const
@@ -96,61 +88,6 @@ struct BiasLeakyReluAdd
     }
 };
-
-struct BiasReluAdd
-{
-    template <typename T1, typename T2>
-    __host__ constexpr float operator()(float v0, T1 v1, T2 v2) const
-    {
-        float b = v0 + v1;
-        float c = b > 0 ? b : 0;
-        float d = c + v2;
-
-        return d;
-    }
-
-    template <typename T1, typename T2>
-    __device__ constexpr float operator()(float v0, T1 v1, T2 v2) const
-    {
-#if 0
-        float a = v1 + v0;
-        float b = max(a, float(0));
-        float c = b + v2;
-
-        return c;
-#else
-        float b = v1 + v2;
-        float c = (v0 > -v1) ? b + v0 : v2;
-
-        return c;
-#endif
-    }
-};
-
-struct BiasLeakyRelu
-{
-    template <typename T1, typename T2>
-    __host__ constexpr float operator()(float v0, T1 v1, T2) const
-    {
-        float a = v0 + v1;
-        float b = 0.1 * a;
-        float c = b > 0 ? b : 0;
-
-        return c;
-    }
-
-    template <typename T1, typename T2>
-    __device__ constexpr float operator()(float v0, T1 v1, T2) const
-    {
-        constexpr float alpha = 0.1;
-
-        float b = v1 + v0;
-        float c = max(b, float(0));
-        float d = alpha * c;
-
-        return d;
-    }
-};
-
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 using OutDataType = ck::half_t;
@@ -163,18 +100,18 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
 using WeiLayout = ck::tensor_layout::convolution::KYXC;
 using OutLayout = ck::tensor_layout::convolution::NHWK;

-using InElementOp  = PassThrough;
-using WeiElementOp = PassThrough;
-using OutElementOp = BiasReluAdd;
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;

+// clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::
     DeviceConv2dFwdXdl_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-// clang-format off
 // | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
 // | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN|
 // | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | |
 // | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
     <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, S<1, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 7, 1, true, true>;
 // clang-format on

 template <typename TIn,
@@ -191,7 +128,7 @@ void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
                                 const std::vector<ck::index_t>& conv_strides,
                                 const std::vector<ck::index_t>& conv_dilations,
                                 const std::vector<ck::index_t>& in_left_pads,
-                                const std::vector<ck::index_t>&,
+                                const std::vector<ck::index_t>& /* in_right_pads */,
                                 const InElementOp& in_element_op,
                                 const WeiElementOp& wei_element_op,
                                 const OutElementOp& out_element_op)
@@ -356,8 +293,8 @@ int main(int argc, char* argv[])
     default:
         in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
         wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
-        resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
     }

     DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
@@ -397,8 +334,8 @@ int main(int argc, char* argv[])
     if(!conv.IsSupportedArgument(argument))
     {
         throw std::runtime_error(
-            "wrong! device_conv with the specified compilation parameters does "
-            "not support this Conv problem");
+            "wrong! device operator with the specified compilation parameters does "
+            "not support this problem");
     }

     float ave_time = invoker.Run(argument, nrepeat);
......
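
A side note on the removed BiasReluAdd functor above: its device path avoided an explicit max by using the branchy form (v0 > -v1) ? (v1 + v2) + v0 : v2, which is algebraically the same as relu(v0 + v1) + v2. A small host-only check of that equivalence (illustrative, not part of the commit), using exactly representable values:

    #include <algorithm>
    #include <cassert>

    int main()
    {
        // Compare relu(v0 + v1) + v2 against the branchy form from the removed #else path.
        for(float v0 : {-2.f, -0.5f, 0.f, 0.5f, 2.f})
            for(float v1 : {-1.f, 0.f, 1.f})
                for(float v2 : {-3.f, 0.f, 3.f})
                {
                    float ref     = std::max(v0 + v1, 0.f) + v2;
                    float branchy = (v0 > -v1) ? (v1 + v2) + v0 : v2;
                    assert(ref == branchy); // exact: every operand and sum is representable
                }
        return 0;
    }
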
@@ -6,21 +6,23 @@
 #include "host_conv.hpp"
 #include "tensor_layout.hpp"
 #include "device_tensor.hpp"
-#include "device_conv_fwd.hpp"
+#include "device_conv_fwd_bias_activation_add.hpp"
 #include "element_wise_operation.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_conv2d_fwd_instance {
+namespace device_conv2d_fwd_bias_activation_add_instance {

-using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
-                                              ck::tensor_operation::element_wise::PassThrough,
-                                              ck::tensor_operation::element_wise::PassThrough>;
+using DeviceConvFwdBiasReluAddPtr =
+    DeviceConvFwdBiasActivationAddPtr<ck::tensor_operation::element_wise::PassThrough,
+                                      ck::tensor_operation::element_wise::PassThrough,
+                                      ck::tensor_operation::element_wise::AddReluAdd>;

-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
+void add_device_conv2d_fwd_bias_relu_add_xdl_nhwc_kyxc_nhwk_fp16_instances(
+    std::vector<DeviceConvFwdBiasReluAddPtr>&);

-} // namespace device_conv2d_fwd_instance
+} // namespace device_conv2d_fwd_bias_activation_add_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
@@ -28,6 +30,56 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances(std::vector<DeviceC
 namespace ck {
 namespace profiler {

+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp>
+void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
+                                const Tensor<TWei>& wei_k_c_y_x,
+                                Tensor<TOut>& out_n_k_ho_wo,
+                                const Tensor<TOut>& bias_k,
+                                const Tensor<TOut>& resi_n_k_ho_wo,
+                                const std::vector<ck::index_t>& conv_strides,
+                                const std::vector<ck::index_t>& conv_dilations,
+                                const std::vector<ck::index_t>& in_left_pads,
+                                const std::vector<ck::index_t>& /* in_right_pads */,
+                                const InElementOp& in_element_op,
+                                const WeiElementOp& wei_element_op,
+                                const OutElementOp& out_element_op)
+{
+    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+        double v = 0;
+        for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
+        {
+            for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
+            {
+                int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
+                for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
+                {
+                    int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
+                    if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
+                    {
+                        v += in_element_op(static_cast<const double>(in_n_c_hi_wi(n, c, hi, wi))) *
+                             wei_element_op(static_cast<const double>(wei_k_c_y_x(k, c, y, x)));
+                    }
+                }
+            }
+        }
+        out_n_k_ho_wo(n, k, ho, wo) = out_element_op(v, bias_k(k), resi_n_k_ho_wo(n, k, ho, wo));
+    };
+
+    make_ParallelTensorFunctor(f_nchw,
+                               out_n_k_ho_wo.mDesc.GetLengths()[0],
+                               out_n_k_ho_wo.mDesc.GetLengths()[1],
+                               out_n_k_ho_wo.mDesc.GetLengths()[2],
+                               out_n_k_ho_wo.mDesc.GetLengths()[3])(
+        std::thread::hardware_concurrency());
+}
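
The reference loop above maps each output position back to input coordinates as hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0] (and likewise for wi), and the bounds check drops taps that land in the padding. A small standalone illustration of that index math (values chosen for illustration only):

    #include <cstdio>

    int main()
    {
        // Same arithmetic as the reference loop: hi = ho * stride + y * dilation - left_pad.
        const int stride = 2, dilation = 1, left_pad = 1, Y = 3; // illustrative values
        for(int ho = 0; ho < 2; ++ho)
            for(int y = 0; y < Y; ++y)
            {
                int hi = ho * stride + y * dilation - left_pad;
                std::printf("ho=%d y=%d -> hi=%d%s\n", ho, y, hi, hi < 0 ? " (skipped)" : "");
            }
        return 0;
    }
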
 template <int NDimSpatial,
           typename InDataType,
           typename WeiDataType,
@@ -35,20 +87,20 @@ template <int NDimSpatial,
           typename InLayout,
           typename WeiLayout,
           typename OutLayout>
-void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
+void profile_conv_fwd_bias_relu_add_impl(int do_verification,
                                          int init_method,
                                          bool do_log,
                                          int nrepeat,
                                          ck::index_t N,
                                          ck::index_t K,
                                          ck::index_t C,
                                          std::vector<ck::index_t> input_spatial_lengths,
                                          std::vector<ck::index_t> filter_spatial_lengths,
                                          std::vector<ck::index_t> output_spatial_lengths,
                                          std::vector<ck::index_t> conv_filter_strides,
                                          std::vector<ck::index_t> conv_filter_dilations,
                                          std::vector<ck::index_t> input_left_pads,
                                          std::vector<ck::index_t> input_right_pads)
 {
     const ck::index_t Y = filter_spatial_lengths[0];
     const ck::index_t X = filter_spatial_lengths[1];
@@ -84,9 +136,18 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
     Tensor<OutDataType> out_n_k_ho_wo_device_result(
         f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));

+    // bias: assume contiguous 1d vector
+    Tensor<OutDataType> bias_k(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+
+    // residual: assume same layout as output tensor
+    Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
+
     std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
     std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
     std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
+    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
+    std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.mDesc << std::endl;

     switch(init_method)
     {
@@ -94,61 +155,63 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
     case 1:
         in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
         wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
         break;
     default:
         in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
         wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
     }

+    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+
     if(do_verification)
     {
-        host_conv_nchw_kcyx_nkhw(in_n_c_hi_wi,
-                                 wei_k_c_y_x,
-                                 out_n_k_ho_wo_host_result,
-                                 conv_filter_strides,
-                                 conv_filter_dilations,
-                                 input_left_pads,
-                                 input_right_pads);
+        host_reference_calculation(in_n_c_hi_wi,
+                                   wei_k_c_y_x,
+                                   out_n_k_ho_wo_host_result,
+                                   bias_k,
+                                   resi_n_k_ho_wo,
+                                   conv_filter_strides,
+                                   conv_filter_dilations,
+                                   input_left_pads,
+                                   input_right_pads,
+                                   InElementOp{},
+                                   WeiElementOp{},
+                                   OutElementOp{});
     }

     DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
     DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
     DeviceMem out_device_buf(sizeof(OutDataType) *
                              out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
+    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
+    DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_k_ho_wo.mDesc.GetElementSpace());

     in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
     wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    bias_device_buf.ToDevice(bias_k.mData.data());
+    resi_device_buf.ToDevice(resi_n_k_ho_wo.mData.data());

-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    using DeviceConvFwdNoOpPtr =
-        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
+    using DeviceConvFwdBiasReluAddPtr = ck::tensor_operation::device::
+        DeviceConvFwdBiasActivationAddPtr<InElementOp, WeiElementOp, OutElementOp>;

-    // add device Conv instances
-    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
+    // add device operator instances
+    std::vector<DeviceConvFwdBiasReluAddPtr> op_ptrs;

-    if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
-                 ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
-                 ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
+    if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
+                 ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
+                 ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
     {
-        ck::tensor_operation::device::device_conv2d_fwd_instance::
-            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp32_instances(conv_ptrs);
-    }
-    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
-                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
-                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
-    {
-        ck::tensor_operation::device::device_conv2d_fwd_instance::
-            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_fp16_instances(conv_ptrs);
-        ck::tensor_operation::device::device_conv2d_fwd_instance::
-            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_fp16_instances(conv_ptrs);
-        ck::tensor_operation::device::device_conv2d_fwd_instance::
-            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_fp16_instances(conv_ptrs);
+        ck::tensor_operation::device::device_conv2d_fwd_bias_activation_add_instance::
+            add_device_conv2d_fwd_bias_relu_add_xdl_nhwc_kyxc_nhwk_fp16_instances(op_ptrs);
     }

-    if(conv_ptrs.size() <= 0)
+    if(op_ptrs.size() <= 0)
     {
         throw std::runtime_error("wrong! no device Conv instance found");
     }
@@ -159,12 +222,14 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
     float best_gb_per_sec = 0;

     // profile device Conv instances
-    for(auto& conv_ptr : conv_ptrs)
+    for(auto& op_ptr : op_ptrs)
     {
-        auto argument_ptr = conv_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
             static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
+            static_cast<const OutDataType*>(resi_device_buf.GetDeviceBuffer()),
             N,
             K,
             C,
@@ -175,23 +240,24 @@ void profile_conv_fwd_bias_relu_bias_impl(int do_verification,
             conv_filter_dilations,
             input_left_pads,
             input_right_pads,
-            PassThrough{},
-            PassThrough{},
-            PassThrough{});
+            InElementOp{},
+            WeiElementOp{},
+            OutElementOp{});

-        auto invoker_ptr = conv_ptr->MakeInvokerPointer();
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();

-        if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            std::string conv_name = conv_ptr->GetTypeString();
+            std::string conv_name = op_ptr->GetTypeString();

             float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;

-            std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
-                                    sizeof(WeiDataType) * (K * C * Y * X) +
-                                    sizeof(OutDataType) * (N * K * Ho * Wo);
+            std::size_t num_btype =
+                sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) +
+                sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K) +
+                sizeof(OutDataType) * (N * K * Ho * Wo);

             float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
......
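
On the reported numbers: flop counts only the convolution multiply-accumulates (2 * N * K * Ho * Wo * C * Y * X), num_btype now also charges the bias (K elements) and the residual (N * K * Ho * Wo elements), and the 1.E9 divisor suggests ave_time is reported in milliseconds, so the quotient comes out in TFLOPS. A back-of-envelope check with purely illustrative sizes:

    #include <cstdio>

    int main()
    {
        // Illustrative problem size, not from the commit: N=128, K=256, C=192, Ho=Wo=71, Y=X=3.
        const double N = 128, K = 256, C = 192, Ho = 71, Wo = 71, Y = 3, X = 3;
        const double flop        = 2 * N * K * Ho * Wo * C * Y * X; // ~5.7e11
        const double ave_time_ms = 5.0;                             // assumed average kernel time
        std::printf("tflops = %.1f\n", flop / 1e9 / ave_time_ms);   // ~114 TFLOPS
        return 0;
    }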