Merge branch 'develop' into wavelet_model

7bcaf2a7 · Adam Osewski · GitHub · e59daa22 · 0345963e · 7bcaf2a7
Unverified Commit 7bcaf2a7 authored Dec 19, 2022 by Adam Osewski Committed by GitHub Dec 19, 2022
20 changed files
--- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp"
+#include "profiler/profile_conv_fwd_bias_relu_add_impl.hpp"
+#include "profiler_operation_registry.hpp"

 enum struct ConvDataType
 {
@@ -32,12 +33,14 @@ enum struct ConvOutputLayout
    NHWK, // 1
 };

+#define OP_NAME "conv_fwd_bias_relu_add"
+#define OP_DESC "Convolution Forward+Bias+ReLU+Add"
+
 int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
 {
    if(argc != 25)
    {
-        printf(
-            "arg1: tensor operation (conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLu+Add)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
@@ -115,3 +118,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd_bias_relu_add);
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_gemm_impl.hpp"
+#include "profiler/profile_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"

 enum struct GemmMatrixLayout
 {
@@ -24,9 +25,12 @@ enum struct GemmDataType
    INT8_INT8_INT8, // 3
 };

+#define OP_NAME "gemm"
+#define OP_DESC "GEMM"
+
 static void print_helper_msg()
 {
-    std::cout << "arg1: tensor operation (gemm: GEMM)\n"
+    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
              << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
              << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
              << "                     1: A[m, k] * B[n, k] = C[m, n];\n"
@@ -184,3 +188,5 @@ int profile_gemm(int argc, char* argv[])
        return 1;
    }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm);
--- a/profiler/src/profile_gemm_add_add_fastgelu.cpp
+++ b/profiler/src/profile_gemm_add_add_fastgelu.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_gemm_add_add_fastgelu_impl.hpp"
+#include "profiler/profile_gemm_add_add_fastgelu_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_add_add_fastgelu"
+#define OP_DESC "GEMM+Add+Add+FastGeLU"

 int profile_gemm_add_add_fastgelu(int argc, char* argv[])
 {
@@ -29,7 +33,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
    if(argc != 16)
    {
        // clang-format off
-        printf("arg1: tensor operation (gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
        printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n] + D1[m, n]);\n");
        printf("                     1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n] + D1[m, n]);\n");
@@ -150,3 +154,5 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
        return 1;
    }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_add_add_fastgelu);
--- a/profiler/src/profile_gemm_add_fastgelu.cpp
+++ b/profiler/src/profile_gemm_add_fastgelu.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_gemm_add_fastgelu_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_add_fastgelu"
+#define OP_DESC "GEMM+Add+FastGeLU"
+
+int profile_gemm_add_fastgelu(int argc, char* argv[])
+{
+    enum struct MatrixLayout
+    {
+        MK_KN_MN_MN, // 0
+        MK_NK_MN_MN, // 1
+        KM_KN_MN_MN, // 2
+        KM_NK_MN_MN, // 3
+    };
+
+    enum struct MatrixDataType
+    {
+        F32_F32_F32_F32,     // 0
+        F16_F16_F16_F16,     // 1
+        BF16_BF16_BF16_BF16, // 2
+        INT8_INT8_INT8_INT8, // 3
+    };
+
+    if(argc != 15)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
+        printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n]);\n");
+        printf("                     1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n]);\n");
+        printf("                     2: E[m, n] = FastGeLU(A[k, m] * B[k, n] + D0[m, n]);\n");
+        printf("                     3: E[m, n] = FastGeLU(A[k, m] * B[n, k] + D0[m, n]))\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
+        // clang-format on
+        exit(1);
+    }
+
+    const auto data_type       = static_cast<MatrixDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<MatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    const int StrideA  = std::stoi(argv[11]);
+    const int StrideB  = std::stoi(argv[12]);
+    const int StrideD0 = std::stoi(argv[13]);
+    const int StrideE  = std::stoi(argv[14]);
+
+    using F16 = ck::half_t;
+    using F32 = float;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    auto profile = [&](auto a_type,
+                       auto b_type,
+                       auto acc_type,
+                       auto d0_type,
+                       auto e_type,
+                       auto a_layout,
+                       auto b_layout,
+                       auto d0_layout,
+                       auto e_layout) {
+        using ADataType   = decltype(a_type);
+        using BDataType   = decltype(b_type);
+        using AccDataType = decltype(acc_type);
+        using D0DataType  = decltype(d0_type);
+        using EDataType   = decltype(e_type);
+
+        using ALayout  = decltype(a_layout);
+        using BLayout  = decltype(b_layout);
+        using D0Layout = decltype(d0_layout);
+        using ELayout  = decltype(e_layout);
+
+        const int DefaultStrideA  = ck::is_same_v<ALayout, Row> ? K : M;
+        const int DefaultStrideB  = ck::is_same_v<BLayout, Row> ? N : K;
+        const int DefaultStrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
+        const int DefaultStrideE  = ck::is_same_v<ELayout, Row> ? N : M;
+
+        bool pass = ck::profiler::profile_gemm_add_fastgelu_impl<ADataType,
+                                                                 BDataType,
+                                                                 AccDataType,
+                                                                 D0DataType,
+                                                                 EDataType,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 D0Layout,
+                                                                 ELayout>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? DefaultStrideA : StrideA,
+            (StrideB < 0) ? DefaultStrideB : StrideB,
+            (StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
+            (StrideE < 0) ? DefaultStrideE : StrideE);
+
+        return pass ? 0 : 1;
+    };
+
+    if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::MK_NK_MN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::KM_KN_MN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::KM_NK_MN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_add_fastgelu);
--- a/profiler/src/profile_gemm_bias_add_reduce.cpp
+++ b/profiler/src/profile_gemm_bias_add_reduce.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_gemm_bias_add_reduce_impl.hpp"
+#include "profiler/profile_gemm_bias_add_reduce_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_bias_add_reduce"
+#define OP_DESC "GEMM+Bias+Add+Reduce"

 int profile_gemm_bias_add_reduce(int argc, char* argv[])
 {
@@ -26,7 +30,7 @@ int profile_gemm_bias_add_reduce(int argc, char* argv[])

    if(!(argc == 14 || argc == 15))
    {
-        printf("arg1: tensor operation (gemm: GEMM+bias+add+Reduce)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -159,3 +163,5 @@ int profile_gemm_bias_add_reduce(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_bias_add_reduce);
--- a/profiler/src/profile_gemm_bilinear.cpp
+++ b/profiler/src/profile_gemm_bilinear.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_gemm_bilinear_impl.hpp"
+#include "profiler/profile_gemm_bilinear_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_bilinear"
+#define OP_DESC "GEMM+Bilinear"

 int profile_gemm_bilinear(int argc, char* argv[])
 {
@@ -29,7 +33,7 @@ int profile_gemm_bilinear(int argc, char* argv[])
    if(argc != 17)
    {
        // clang-format off
-        printf("arg1: tensor operation (gemm_bilinear: GEMM+Bilinear)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
        printf("arg3: matrix layout (0: E[m, n] = alpha * A[m, k] * B[k, n] + beta * D[m, n];\n");
        printf("                     1: E[m, n] = alpha * A[m, k] * B[n, k] + beta * D[m, n];\n");
@@ -144,3 +148,5 @@ int profile_gemm_bilinear(int argc, char* argv[])
        return 1;
    }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_bilinear);
--- a/profiler/src/profile_gemm_fastgelu.cpp
+++ b/profiler/src/profile_gemm_fastgelu.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_gemm_fastgelu_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_fastgelu"
+#define OP_DESC "GEMM+FastGeLU"
+
+int profile_gemm_fastgelu(int argc, char* argv[])
+{
+    enum struct MatrixLayout
+    {
+        MK_KN_MN, // 0
+        MK_NK_MN, // 1
+        KM_KN_MN, // 2
+        KM_NK_MN, // 3
+    };
+
+    enum struct MatrixDataType
+    {
+        F32_F32_F32,    // 0
+        F16_F16_F16,    // 1
+        BF16_BF16_BF16, // 2
+        INT8_INT8_INT8, // 3
+    };
+
+    if(argc != 14)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
+        printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n]);\n");
+        printf("                     1: E[m, n] = FastGeLU(A[m, k] * B[n, k]);\n");
+        printf("                     2: E[m, n] = FastGeLU(A[k, m] * B[k, n]);\n");
+        printf("                     3: E[m, n] = FastGeLU(A[k, m] * B[n, k]))\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideE\n");
+        // clang-format on
+        exit(1);
+    }
+
+    const auto data_type       = static_cast<MatrixDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<MatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    const int StrideA = std::stoi(argv[11]);
+    const int StrideB = std::stoi(argv[12]);
+    const int StrideE = std::stoi(argv[13]);
+
+    using F16 = ck::half_t;
+    using F32 = float;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    auto profile = [&](auto a_type,
+                       auto b_type,
+                       auto acc_type,
+                       auto e_type,
+                       auto a_layout,
+                       auto b_layout,
+                       auto e_layout) {
+        using ADataType   = decltype(a_type);
+        using BDataType   = decltype(b_type);
+        using AccDataType = decltype(acc_type);
+        using EDataType   = decltype(e_type);
+
+        using ALayout = decltype(a_layout);
+        using BLayout = decltype(b_layout);
+        using ELayout = decltype(e_layout);
+
+        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+        const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
+
+        bool pass = ck::profiler::profile_gemm_fastgelu_impl<ADataType,
+                                                             BDataType,
+                                                             AccDataType,
+                                                             EDataType,
+                                                             ALayout,
+                                                             BLayout,
+                                                             ELayout>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? DefaultStrideA : StrideA,
+            (StrideB < 0) ? DefaultStrideB : StrideB,
+            (StrideE < 0) ? DefaultStrideE : StrideE);
+
+        return pass ? 0 : 1;
+    };
+
+    if(data_type == MatrixDataType::F16_F16_F16 && layout == MatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == MatrixDataType::F16_F16_F16 && layout == MatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == MatrixDataType::F16_F16_F16 && layout == MatrixLayout::KM_KN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == MatrixDataType::F16_F16_F16 && layout == MatrixLayout::KM_NK_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_fastgelu);
--- a/profiler/src/profile_gemm_reduce.cpp
+++ b/profiler/src/profile_gemm_reduce.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_gemm_reduce_impl.hpp"
+#include "profiler/profile_gemm_reduce_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_reduce"
+#define OP_DESC "GEMM+Reduce"

 int profile_gemm_reduce(int argc, char* argv[])
 {
@@ -26,7 +30,7 @@ int profile_gemm_reduce(int argc, char* argv[])

    if(!(argc == 14 || argc == 15))
    {
-        printf("arg1: tensor operation (gemm: GEMM+Reduce)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -146,3 +150,5 @@ int profile_gemm_reduce(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_reduce);
--- a/profiler/src/profile_gemm_splitk.cpp
+++ b/profiler/src/profile_gemm_splitk.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_gemm_splitk_impl.hpp"
+#include "profiler/profile_gemm_splitk_impl.hpp"
+#include "profiler_operation_registry.hpp"

 enum struct GemmMatrixLayout
 {
@@ -24,11 +25,14 @@ enum struct GemmDataType
    INT8_INT8_INT8, // 3
 };

+#define OP_NAME "gemm_splitk"
+#define OP_DESC "Split-K GEMM"
+
 int profile_gemm_splitk(int argc, char* argv[])
 {
    if(argc != 15)
    {
-        printf("arg1: tensor operation (gemm_splitk: Split-K GEMM)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -146,3 +150,5 @@ int profile_gemm_splitk(int argc, char* argv[])
        return 1;
    }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_splitk);
--- a/profiler/src/profile_grouped_conv_bwd_weight.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp
@@ -6,7 +6,8 @@
 #include <iostream>
 #include <numeric>

-#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
+#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
+#include "profiler_operation_registry.hpp"

 namespace {

@@ -23,9 +24,12 @@ enum struct ConvDataType
    BF16_F32_BF16, // 2
 };

+#define OP_NAME "grouped_conv_bwd_weight"
+#define OP_DESC "Grouped Convolution Backward Weight"
+
 static void print_helper_msg()
 {
-    std::cout << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
+    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
              << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
              << "                 1: Input fp16, Weight fp16, Output fp16\n"
              << "                 2: Input bf16, Weight fp32, Output bf16)\n"
@@ -174,3 +178,5 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])

    return 1;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_bwd_weight);
--- a/profiler/src/profile_grouped_conv_fwd.cpp
+++ b/profiler/src/profile_grouped_conv_fwd.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_grouped_conv_fwd_impl.hpp"
+#include "profiler/profile_grouped_conv_fwd_impl.hpp"
+#include "profiler_operation_registry.hpp"

 namespace {

@@ -24,11 +25,14 @@ enum struct ConvDataType
    INT8_INT8_INT8, // 3
 };

+#define OP_NAME "grouped_conv_fwd"
+#define OP_DESC "Grouped Convolution Forward"
+
 static void print_helper_msg()
 {
    std::cout
        // clang-format off
-        << "arg1: tensor operation (grouped_conv_fwd: Grouped Convolution Forward)\n"
+        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
        << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
        << "                 1: Input fp16, Weight fp16, Output fp16\n"
        << "                 2: Input bf16, Weight bf16, Output bf16\n"
@@ -252,3 +256,5 @@ int profile_grouped_conv_fwd(int argc, char* argv[])

    return 1;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_fwd);
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_grouped_gemm_impl.hpp"
+#include "profiler/profile_grouped_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"

 enum struct GemmMatrixLayout
 {
@@ -44,11 +45,14 @@ std::vector<int> argToIntArray(char* input)
    return out;
 }

+#define OP_NAME "grouped_gemm"
+#define OP_DESC "Grouped GEMM"
+
 int profile_grouped_gemm(int argc, char* argv[])
 {
    if(!(argc == 14))
    {
-        printf("arg1: tensor operation (grouped_gemm: Grouped GEMM)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -161,3 +165,5 @@ int profile_grouped_gemm(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm);
--- a/profiler/src/profile_groupnorm.cpp
+++ b/profiler/src/profile_groupnorm.cpp
@@ -5,8 +5,9 @@
 #include <vector>
 #include <unordered_map>

-#include "profiler/include/data_type_enum.hpp"
-#include "profiler/include/profile_groupnorm_impl.hpp"
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_groupnorm_impl.hpp"
+#include "profiler_operation_registry.hpp"

 using ck::index_t;

@@ -43,9 +44,12 @@ struct GroupnormArgParser
    }
 };

+#define OP_NAME "groupnorm"
+#define OP_DESC "Group Normalization"
+
 void print_help_groupnorm()
 {
-    std::cout << "arg1: tensor operation (groupnorm: Group normalization)\n"
+    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
              << "arg2: data type (0: fp16; 1: fp32)\n"
              << "arg3: verification (0: no; 1: yes)\n"
              << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
@@ -104,3 +108,5 @@ int profile_groupnorm(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_groupnorm);
--- a/profiler/src/profile_layernorm.cpp
+++ b/profiler/src/profile_layernorm.cpp
@@ -5,8 +5,9 @@
 #include <vector>
 #include <unordered_map>

-#include "profiler/include/data_type_enum.hpp"
-#include "profiler/include/profile_layernorm_impl.hpp"
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_layernorm_impl.hpp"
+#include "profiler_operation_registry.hpp"

 using ck::index_t;

@@ -96,3 +97,5 @@ int profile_layernorm(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION("layernorm", "Layer Normalization", profile_layernorm);
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
@@ -13,8 +13,9 @@

 #include "ck/library/utility/host_common_util.hpp"

-#include "profiler/include/profile_reduce_impl.hpp"
-#include "profiler/include/data_type_enum.hpp"
+#include "profiler/profile_reduce_impl.hpp"
+#include "profiler/data_type_enum.hpp"
+#include "profiler_operation_registry.hpp"

 using namespace std;

@@ -429,3 +430,5 @@ int profile_reduce(int argc, char* argv[])

    return (0);
 };
+
+REGISTER_PROFILER_OPERATION("reduce", "Reduce", profile_reduce);
--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -5,7 +5,8 @@
 #include <vector>
 #include <unordered_map>

-#include "profiler/include/profile_softmax_impl.hpp"
+#include "profiler/profile_softmax_impl.hpp"
+#include "profiler_operation_registry.hpp"

 using ck::index_t;
 using ck::profiler::SoftmaxDataType;
@@ -164,3 +165,5 @@ int profile_softmax(int argc, char* argv[])
 //     profile_normalization(argc, argv);
 //     return 0;
 // }
+
+REGISTER_PROFILER_OPERATION("softmax", "Softmax", profile_softmax);
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include <cstring>
+#include <cstdlib>
+#include <iostream>

-int profile_gemm(int, char*[]);
-int profile_gemm_splitk(int, char*[]);
-int profile_gemm_bilinear(int, char*[]);
-int profile_gemm_add_add_fastgelu(int, char*[]);
-int profile_gemm_reduce(int, char*[]);
-int profile_gemm_bias_add_reduce(int, char*[]);
-int profile_batched_gemm(int, char*[]);
-int profile_batched_gemm_gemm(int, char*[]);
-int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
-int profile_batched_gemm_reduce(int, char*[]);
-int profile_grouped_gemm(int, char*[]);
-int profile_conv_fwd(int, char*[]);
-int profile_conv_fwd_bias_relu(int, char*[]);
-int profile_conv_fwd_bias_relu_add(int, char*[]);
-int profile_conv_bwd_data(int, char*[]);
-int profile_grouped_conv_fwd(int, char*[]);
-int profile_grouped_conv_bwd_weight(int, char*[]);
-int profile_softmax(int, char*[]);
-int profile_layernorm(int, char*[]);
-int profile_groupnorm(int, char*[]);
-int profile_reduce(int, char*[]);
-int profile_batchnorm_forward(int, char*[]);
+#include "profiler_operation_registry.hpp"

 static void print_helper_message()
 {
-    // clang-format off
-    printf("arg1: tensor operation (gemm: GEMM\n"
-           "                        gemm_splitk: Split-K GEMM\n"
-           "                        gemm_bilinear: GEMM+Bilinear\n"
-           "                        gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU\n"
-           "                        gemm_reduce: GEMM+Reduce\n"
-           "                        gemm_bias_add_reduce: GEMM+Bias+Add+Reduce\n"
-           "                        batched_gemm: Batched GEMM\n"
-           "                        batched_gemm_gemm: Batched+GEMM+GEMM\n"
-           "                        batched_gemm_add_relu_gemm_add: Batched+GEMM+bias+gelu+GEMM+bias\n"
-           "                        batched_gemm_reduce: Batched GEMM+Reduce\n"
-           "                        grouped_gemm: Grouped GEMM\n"
-           "                        conv_fwd: Convolution Forward\n"
-           "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
-           "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
-           "                        conv_bwd_data: Convolution Backward Data\n"
-           "                        grouped_conv_fwd: Grouped Convolution Forward\n"
-           "                        grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n"
-           "                        softmax: Softmax\n"
-           "                        reduce: Reduce\n"
-	   "                        bnorm_fwd: Batchnorm forward\n");
-    // clang-format on
+    std::cout << "arg1: tensor operation " << ProfilerOperationRegistry::GetInstance() << std::endl;
 }

 int main(int argc, char* argv[])
@@ -57,101 +16,15 @@ int main(int argc, char* argv[])
    if(argc == 1)
    {
        print_helper_message();
-
-        return 0;
-    }
-    else if(strcmp(argv[1], "gemm") == 0)
-    {
-        return profile_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_splitk") == 0)
-    {
-        return profile_gemm_splitk(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_bilinear") == 0)
-    {
-        return profile_gemm_bilinear(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_add_add_fastgelu") == 0)
-    {
-        return profile_gemm_add_add_fastgelu(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_reduce") == 0)
-    {
-        return profile_gemm_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_bias_add_reduce") == 0)
-    {
-        return profile_gemm_bias_add_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm") == 0)
-    {
-        return profile_batched_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm_gemm") == 0)
-    {
-        return profile_batched_gemm_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm_add_relu_gemm_add") == 0)
-    {
-        return profile_batched_gemm_add_relu_gemm_add(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm_reduce") == 0)
-    {
-        return profile_batched_gemm_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "grouped_gemm") == 0)
-    {
-        return profile_grouped_gemm(argc, argv);
    }
-    else if(strcmp(argv[1], "conv_fwd") == 0)
+    else if(const auto operation = ProfilerOperationRegistry::GetInstance().Get(argv[1]);
+            operation.has_value())
    {
-        return profile_conv_fwd(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
-    {
-        return profile_conv_fwd_bias_relu(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_fwd_bias_relu_add") == 0)
-    {
-        return profile_conv_fwd_bias_relu_add(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_bwd_data") == 0)
-    {
-        return profile_conv_bwd_data(argc, argv);
-    }
-    else if(strcmp(argv[1], "grouped_conv_fwd") == 0)
-    {
-        return profile_grouped_conv_fwd(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_bwd_weight") == 0)
-    {
-        return profile_grouped_conv_bwd_weight(argc, argv);
-    }
-    else if(strcmp(argv[1], "reduce") == 0)
-    {
-        return profile_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "softmax") == 0)
-    {
-        return profile_softmax(argc, argv);
-    }
-    else if(strcmp(argv[1], "layernorm") == 0)
-    {
-        return profile_layernorm(argc, argv);
-    }
-    else if(strcmp(argv[1], "groupnorm") == 0)
-    {
-        return profile_groupnorm(argc, argv);
-    }
-    else if(strcmp(argv[1], "bnorm_fwd") == 0)
-    {
-        return profile_batchnorm_forward(argc, argv);
+        return (*operation)(argc, argv);
    }
    else
    {
-        print_helper_message();
-
-        return 0;
+        std::cerr << "cannot find operation: " << argv[1] << std::endl;
+        return EXIT_FAILURE;
    }
 }
--- a/profiler/src/profiler_operation_registry.hpp
+++ b/profiler/src/profiler_operation_registry.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <optional>
+#include <string_view>
+#include <utility>
+
+class ProfilerOperationRegistry final
+{
+    ProfilerOperationRegistry()  = default;
+    ~ProfilerOperationRegistry() = default;
+
+    public:
+    using Operation = std::function<int(int, char*[])>;
+
+    private:
+    struct Entry final
+    {
+        explicit Entry(std::string_view description, Operation operation) noexcept
+            : description_(description), operation_(std::move(operation))
+        {
+        }
+
+        std::string_view description_;
+        Operation operation_;
+    };
+
+    std::map<std::string_view, Entry> entries_;
+
+    friend std::ostream& operator<<(std::ostream& stream, const ProfilerOperationRegistry& registry)
+    {
+        stream << "{\n";
+        for(auto& [name, entry] : registry.entries_)
+        {
+            stream << "\t" << name << ": " << entry.description_ << "\n";
+        }
+        stream << "}";
+
+        return stream;
+    }
+
+    public:
+    static ProfilerOperationRegistry& GetInstance()
+    {
+        static ProfilerOperationRegistry registry;
+        return registry;
+    }
+
+    std::optional<Operation> Get(std::string_view name) const
+    {
+        const auto found = entries_.find(name);
+        if(found == end(entries_))
+        {
+            return std::nullopt;
+        }
+
+        return (found->second).operation_;
+    }
+
+    bool Add(std::string_view name, std::string_view description, Operation operation)
+    {
+        return entries_
+            .emplace(std::piecewise_construct,
+                     std::forward_as_tuple(name),
+                     std::forward_as_tuple(description, std::move(operation)))
+            .second;
+    }
+};
+
+#define PP_CONCAT(x, y) PP_CONCAT_IMPL(x, y)
+#define PP_CONCAT_IMPL(x, y) x##y
+
+#define REGISTER_PROFILER_OPERATION(name, description, operation)              \
+    static const bool PP_CONCAT(operation_registration_result_, __COUNTER__) = \
+        ::ProfilerOperationRegistry::GetInstance().Add(name, description, operation)
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
 include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/
+    ${PROJECT_SOURCE_DIR}/profiler/include
 )

 include(googletest)
@@ -53,4 +54,7 @@ add_subdirectory(softmax)
 add_subdirectory(normalization)
 add_subdirectory(data_type)
 add_subdirectory(elementwise_normalization)
-add_subdirectory(batchnorm_fwd)
+add_subdirectory(batchnorm)
+if(GPU_TARGETS MATCHES "gfx1100")
+    add_subdirectory(wmma_op)
+endif()
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"

 namespace {
 using ADataType = ck::bhalf_t;