"LinuxGUI/vscode:/vscode.git/clone" did not exist on "5eec53539196a4a48c3a6f94badd7e7079eb0338"
Unverified Commit 823657ed authored by Chao Liu's avatar Chao Liu Committed by GitHub
Browse files

GEMM+Bias+ReLU+Add (#76)

* tweak conv for odd C

* update script

* clean up elementwise op

* fix build

* clean up

* added example for gemm+bias+relu+add

* added example for gemm+bias+relu

* add profiler for gemm_s_shuffle; re-org files

* add profiler

* fix build

* clean up

* clean up

* clean up

* fix build
parent 690c75a7
...@@ -4,15 +4,6 @@ ...@@ -4,15 +4,6 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_gemm_xdl.hpp"
#include "profile_gemm_impl.hpp" #include "profile_gemm_impl.hpp"
enum GemmMatrixLayout enum GemmMatrixLayout
......
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_gemm_bias_relu_impl.hpp"
// Memory layout of the A, B and C matrices, encoded as <A>_<B>_<C>.
// M/K order gives the dimension that varies slowest/fastest in memory:
// e.g. MK_KN_MN = A is row-major [M,K], B is row-major [K,N], C is row-major [M,N].
// Plain (unscoped) enum on purpose: callers cast the raw argv integer into it.
enum GemmMatrixLayout
{
    MK_KN_MN, // 0: A[m,k] * B[k,n] = C[m,n]
    MK_NK_MN, // 1: A[m,k] * B[n,k] = C[m,n]
    KM_KN_MN, // 2: A[k,m] * B[k,n] = C[m,n]
    KM_NK_MN, // 3: A[k,m] * B[n,k] = C[m,n]
    MK_KN_NM, // 4: A[m,k] * B[k,n] = C[n,m]
    MK_NK_NM, // 5: A[m,k] * B[n,k] = C[n,m]
    KM_KN_NM, // 6: A[k,m] * B[k,n] = C[n,m]
    KM_NK_NM, // 7: A[k,m] * B[n,k] = C[n,m]
};

// Element types of A/B/C, encoded as <A>_<B>_<C>. Selected via argv[2].
enum GemmDataType
{
    F32_F32_F32, // 0: fp32 in, fp32 out
    F16_F16_F16, // 1: fp16 in, fp16 out
};
// Command-line entry point for profiling fused GEMM+Bias+ReLU.
//
// Expected argv (argv[1] is the operation name, dispatched by main):
//   argv[2]  data type      (0: fp32; 1: fp16)
//   argv[3]  matrix layout  (GemmMatrixLayout value 0-3)
//   argv[4]  verification   (0/1)
//   argv[5]  init method    (0: none; 1: integer; 2: decimal)
//   argv[6]  print tensors  (0/1)
//   argv[7]  repeat count
//   argv[8..13] M, N, K, StrideA, StrideB, StrideC (negative stride => packed default)
//   argv[14] optional split-K batch count
//
// Exits the process with status 1 on a bad argument count; throws
// std::runtime_error for an unimplemented data_type/layout combination.
// Returns 1 on success (historical convention kept for caller compatibility).
int profile_gemm_bias_relu(int argc, char* argv[])
{
    if(!(argc == 14 || argc == 15))
    {
        printf("arg1: tensor operation (gemm: GEMM+Bias+ReLU)\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        // was mislabeled "arg8": the print flag is argv[6]
        printf("arg6: print tensor value (0: no; 1: yes)\n");
        printf("arg7: run kernel # of times (>1)\n");
        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
        printf("arg14: split k into multiple batch\n");
        exit(1);
    }

    const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
    const auto layout    = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));

    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const int nrepeat          = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
    const int K = std::stoi(argv[10]);

    const int StrideA = std::stoi(argv[11]);
    const int StrideB = std::stoi(argv[12]);
    const int StrideC = std::stoi(argv[13]);

    int KBatch = 1;
    if(argc == 15)
        KBatch = std::stoi(argv[14]);

    // NOTE(review): KBatch is parsed but never forwarded — split-K is not yet
    // supported by profile_gemm_bias_relu_impl. Silence the unused warning
    // until the impl grows a KBatch parameter.
    (void)KBatch;

    // A negative user-supplied stride selects the packed (dense) leading
    // dimension for that matrix.
    const auto stride_or = [](int stride, int packed) { return (stride < 0) ? packed : stride; };

    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_gemm_bias_relu_impl<ck::half_t,
                                                  ck::half_t,
                                                  ck::half_t,
                                                  ck::tensor_layout::gemm::RowMajor,
                                                  ck::tensor_layout::gemm::RowMajor,
                                                  ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            nrepeat,
            M,
            N,
            K,
            stride_or(StrideA, K),
            stride_or(StrideB, N),
            stride_or(StrideC, N));
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        ck::profiler::profile_gemm_bias_relu_impl<ck::half_t,
                                                  ck::half_t,
                                                  ck::half_t,
                                                  ck::tensor_layout::gemm::RowMajor,
                                                  ck::tensor_layout::gemm::ColumnMajor,
                                                  ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            nrepeat,
            M,
            N,
            K,
            stride_or(StrideA, K),
            stride_or(StrideB, K),
            stride_or(StrideC, N));
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
    {
        ck::profiler::profile_gemm_bias_relu_impl<ck::half_t,
                                                  ck::half_t,
                                                  ck::half_t,
                                                  ck::tensor_layout::gemm::ColumnMajor,
                                                  ck::tensor_layout::gemm::RowMajor,
                                                  ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            nrepeat,
            M,
            N,
            K,
            stride_or(StrideA, M),
            stride_or(StrideB, N),
            stride_or(StrideC, N));
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
    {
        ck::profiler::profile_gemm_bias_relu_impl<ck::half_t,
                                                  ck::half_t,
                                                  ck::half_t,
                                                  ck::tensor_layout::gemm::ColumnMajor,
                                                  ck::tensor_layout::gemm::ColumnMajor,
                                                  ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            nrepeat,
            M,
            N,
            K,
            stride_or(StrideA, M),
            stride_or(StrideB, K),
            stride_or(StrideC, N));
    }
    else
    {
        throw std::runtime_error("wrong! this data_type & layout is not implemented");
    }

    return 1;
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_gemm_bias_relu_add_impl.hpp"
// Memory layout of the A, B and C matrices, encoded as <A>_<B>_<C>.
// M/K order gives the dimension that varies slowest/fastest in memory:
// e.g. MK_KN_MN = A is row-major [M,K], B is row-major [K,N], C is row-major [M,N].
// Plain (unscoped) enum on purpose: callers cast the raw argv integer into it.
enum GemmMatrixLayout
{
    MK_KN_MN, // 0: A[m,k] * B[k,n] = C[m,n]
    MK_NK_MN, // 1: A[m,k] * B[n,k] = C[m,n]
    KM_KN_MN, // 2: A[k,m] * B[k,n] = C[m,n]
    KM_NK_MN, // 3: A[k,m] * B[n,k] = C[m,n]
    MK_KN_NM, // 4: A[m,k] * B[k,n] = C[n,m]
    MK_NK_NM, // 5: A[m,k] * B[n,k] = C[n,m]
    KM_KN_NM, // 6: A[k,m] * B[k,n] = C[n,m]
    KM_NK_NM, // 7: A[k,m] * B[n,k] = C[n,m]
};

// Element types of A/B/C, encoded as <A>_<B>_<C>. Selected via argv[2].
enum GemmDataType
{
    F32_F32_F32, // 0: fp32 in, fp32 out
    F16_F16_F16, // 1: fp16 in, fp16 out
};
// Command-line entry point for profiling fused GEMM+Bias+ReLU+Add.
//
// Expected argv (argv[1] is the operation name, dispatched by main):
//   argv[2]  data type      (0: fp32; 1: fp16)
//   argv[3]  matrix layout  (GemmMatrixLayout value 0-3)
//   argv[4]  verification   (0/1)
//   argv[5]  init method    (0: none; 1: integer; 2: decimal)
//   argv[6]  print tensors  (0/1)
//   argv[7]  repeat count
//   argv[8..14] M, N, K, StrideA, StrideB, StrideC, StrideC1
//               (negative stride => packed default)
//   argv[15] optional split-K batch count
//
// Exits the process with status 1 on a bad argument count; throws
// std::runtime_error for an unimplemented data_type/layout combination.
// Returns 1 on success (historical convention kept for caller compatibility).
int profile_gemm_bias_relu_add(int argc, char* argv[])
{
    if(!(argc == 15 || argc == 16))
    {
        printf("arg1: tensor operation (gemm: GEMM+Bias+ReLU+Add)\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        // was mislabeled "arg8": the print flag is argv[6]
        printf("arg6: print tensor value (0: no; 1: yes)\n");
        printf("arg7: run kernel # of times (>1)\n");
        printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
        printf("arg15: split k into multiple batch\n");
        exit(1);
    }

    const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
    const auto layout    = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));

    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const int nrepeat          = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
    const int K = std::stoi(argv[10]);

    const int StrideA  = std::stoi(argv[11]);
    const int StrideB  = std::stoi(argv[12]);
    const int StrideC  = std::stoi(argv[13]);
    const int StrideC1 = std::stoi(argv[14]);

    int KBatch = 1;
    if(argc == 16)
        KBatch = std::stoi(argv[15]);

    // NOTE(review): KBatch is parsed but never forwarded — split-K is not yet
    // supported by profile_gemm_bias_relu_add_impl. Silence the unused warning
    // until the impl grows a KBatch parameter.
    (void)KBatch;

    // A negative user-supplied stride selects the packed (dense) leading
    // dimension for that matrix.
    const auto stride_or = [](int stride, int packed) { return (stride < 0) ? packed : stride; };

    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_gemm_bias_relu_add_impl<ck::half_t,
                                                      ck::half_t,
                                                      ck::half_t,
                                                      ck::tensor_layout::gemm::RowMajor,
                                                      ck::tensor_layout::gemm::RowMajor,
                                                      ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            nrepeat,
            M,
            N,
            K,
            stride_or(StrideA, K),
            stride_or(StrideB, N),
            stride_or(StrideC, N),
            stride_or(StrideC1, N));
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        ck::profiler::profile_gemm_bias_relu_add_impl<ck::half_t,
                                                      ck::half_t,
                                                      ck::half_t,
                                                      ck::tensor_layout::gemm::RowMajor,
                                                      ck::tensor_layout::gemm::ColumnMajor,
                                                      ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            nrepeat,
            M,
            N,
            K,
            stride_or(StrideA, K),
            stride_or(StrideB, K),
            stride_or(StrideC, N),
            stride_or(StrideC1, N));
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
    {
        ck::profiler::profile_gemm_bias_relu_add_impl<ck::half_t,
                                                      ck::half_t,
                                                      ck::half_t,
                                                      ck::tensor_layout::gemm::ColumnMajor,
                                                      ck::tensor_layout::gemm::RowMajor,
                                                      ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            nrepeat,
            M,
            N,
            K,
            stride_or(StrideA, M),
            stride_or(StrideB, N),
            stride_or(StrideC, N),
            stride_or(StrideC1, N));
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
    {
        ck::profiler::profile_gemm_bias_relu_add_impl<ck::half_t,
                                                      ck::half_t,
                                                      ck::half_t,
                                                      ck::tensor_layout::gemm::ColumnMajor,
                                                      ck::tensor_layout::gemm::ColumnMajor,
                                                      ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            nrepeat,
            M,
            N,
            K,
            stride_or(StrideA, M),
            stride_or(StrideB, K),
            stride_or(StrideC, N),
            stride_or(StrideC1, N));
    }
    else
    {
        throw std::runtime_error("wrong! this data_type & layout is not implemented");
    }

    return 1;
}
...@@ -6,6 +6,8 @@ ...@@ -6,6 +6,8 @@
#include <half.hpp> #include <half.hpp>
int profile_gemm(int, char*[]); int profile_gemm(int, char*[]);
int profile_gemm_bias_relu(int, char*[]);
int profile_gemm_bias_relu_add(int, char*[]);
int profile_conv_fwd(int, char*[]); int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]); int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]); int profile_conv_fwd_bias_relu_add(int, char*[]);
...@@ -17,6 +19,14 @@ int main(int argc, char* argv[]) ...@@ -17,6 +19,14 @@ int main(int argc, char* argv[])
{ {
return profile_gemm(argc, argv); return profile_gemm(argc, argv);
} }
if(strcmp(argv[1], "gemm_bias_relu") == 0)
{
return profile_gemm_bias_relu(argc, argv);
}
if(strcmp(argv[1], "gemm_bias_relu_add") == 0)
{
return profile_gemm_bias_relu_add(argc, argv);
}
else if(strcmp(argv[1], "conv_fwd") == 0) else if(strcmp(argv[1], "conv_fwd") == 0)
{ {
return profile_conv_fwd(argc, argv); return profile_conv_fwd(argc, argv);
...@@ -35,12 +45,16 @@ int main(int argc, char* argv[]) ...@@ -35,12 +45,16 @@ int main(int argc, char* argv[])
} }
else else
{ {
printf("arg1: tensor operation (gemm: GEMM;\n" // clang-format off
" conv_fwd: ForwardConvolution;\n" printf("arg1: tensor operation (gemm: GEMM\n"
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU)\n" " gemm_bias_relu: GEMM+Bias+ReLU\n"
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add)\n" " gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n"
" conv_fwd_bias_relu_atomic_add: " " conv_fwd: ForwardConvolution\n"
"ForwardConvolution+Bias+ReLU+AtomicAdd)\n"); " conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
" conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n");
// clang-format on
return 0; return 0;
} }
} }
...@@ -14,7 +14,6 @@ namespace host { ...@@ -14,7 +14,6 @@ namespace host {
template <typename InDataType, template <typename InDataType,
typename WeiDataType, typename WeiDataType,
typename OutDataType, typename OutDataType,
typename AccDataType,
typename InElementwiseOperation, typename InElementwiseOperation,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation> typename OutElementwiseOperation>
...@@ -68,7 +67,8 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -68,7 +67,8 @@ struct ReferenceConvFwd : public device::BaseOperator
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v = 0; float v_acc = 0;
for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
{ {
for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
...@@ -82,17 +82,26 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -82,17 +82,26 @@ struct ReferenceConvFwd : public device::BaseOperator
if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
{ {
v += arg.in_element_op_( float v_in;
ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) * float v_wei;
arg.wei_element_op_(
ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x))); arg.in_element_op_(
v_in,
static_cast<const float>(arg.in_n_c_hi_wi_(n, c, hi, wi)));
arg.wei_element_op_(
v_wei, static_cast<const float>(arg.wei_k_c_y_x_(k, c, y, x)));
v_acc += v_in * v_wei;
} }
} }
} }
} }
arg.out_n_k_ho_wo_(n, k, ho, wo) = float v_out;
ck::type_convert<OutDataType>(arg.out_element_op_(v));
arg.out_element_op_(v_out, v_acc);
arg.out_n_k_ho_wo_(n, k, ho, wo) = v_out;
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
...@@ -101,6 +110,7 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -101,6 +110,7 @@ struct ReferenceConvFwd : public device::BaseOperator
arg.out_n_k_ho_wo_.mDesc.GetLengths()[2], arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])( arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -160,6 +170,7 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -160,6 +170,7 @@ struct ReferenceConvFwd : public device::BaseOperator
return str.str(); return str.str();
} }
}; };
} // namespace host } // namespace host
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
......
...@@ -15,7 +15,6 @@ namespace host { ...@@ -15,7 +15,6 @@ namespace host {
template <typename InDataType, template <typename InDataType,
typename WeiDataType, typename WeiDataType,
typename OutDataType, typename OutDataType,
typename AccDataType,
typename InElementwiseOperation, typename InElementwiseOperation,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation> typename OutElementwiseOperation>
...@@ -72,7 +71,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator ...@@ -72,7 +71,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v = 0; float v_acc = 0;
for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
{ {
for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
...@@ -86,17 +86,26 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator ...@@ -86,17 +86,26 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
{ {
v += arg.in_element_op_( float v_in;
ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) * float v_wei;
arg.wei_element_op_(
ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x))); arg.in_element_op_(
v_in,
static_cast<const float>(arg.in_n_c_hi_wi_(n, c, hi, wi)));
arg.wei_element_op_(
v_wei, static_cast<const float>(arg.wei_k_c_y_x_(k, c, y, x)));
v_acc += v_in * v_wei;
} }
} }
} }
} }
arg.out_n_k_ho_wo_(n, k, ho, wo) = float v_out;
ck::type_convert<OutDataType>(arg.out_element_op_(v, arg.bias_k_(k)));
arg.out_element_op_(v_out, v_acc, static_cast<float>(arg.bias_k_(k)));
arg.out_n_k_ho_wo_(n, k, ho, wo) = v_out;
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
...@@ -166,6 +175,7 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator ...@@ -166,6 +175,7 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
return str.str(); return str.str();
} }
}; };
} // namespace host } // namespace host
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
......
...@@ -15,7 +15,6 @@ namespace host { ...@@ -15,7 +15,6 @@ namespace host {
template <typename InDataType, template <typename InDataType,
typename WeiDataType, typename WeiDataType,
typename OutDataType, typename OutDataType,
typename AccDataType,
typename InElementwiseOperation, typename InElementwiseOperation,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation> typename OutElementwiseOperation>
...@@ -75,7 +74,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator ...@@ -75,7 +74,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v = 0; float v_acc = 0;
for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
{ {
for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
...@@ -89,23 +89,29 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator ...@@ -89,23 +89,29 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
{ {
v += arg.in_element_op_( float v_in;
ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) * float v_wei;
arg.wei_element_op_(
ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x))); arg.in_element_op_(
v_in,
static_cast<const float>(arg.in_n_c_hi_wi_(n, c, hi, wi)));
arg.wei_element_op_(
v_wei, static_cast<const float>(arg.wei_k_c_y_x_(k, c, y, x)));
v_acc += v_in * v_wei;
} }
} }
} }
} }
float v2 = ck::type_convert<float>(arg.out_n_k_ho_wo_(n, k, ho, wo)); float v_out;
arg.out_element_op_(v2, arg.out_element_op_(v_out,
v, v_acc,
ck::type_convert<float>(arg.bias_k_(k)), static_cast<const float>(arg.bias_k_(k)),
ck::type_convert<float>(arg.resi_n_k_ho_wo_(n, k, ho, wo))); static_cast<const float>(arg.resi_n_k_ho_wo_(n, k, ho, wo)));
arg.out_n_k_ho_wo_(n, k, ho, wo) = ck::type_convert<OutDataType>(v2); arg.out_n_k_ho_wo_(n, k, ho, wo) = v_out;
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
...@@ -177,6 +183,7 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator ...@@ -177,6 +183,7 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
return str.str(); return str.str();
} }
}; };
} // namespace host } // namespace host
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment