Modularize ckProfiler operations (#514)

* Re-structure ckProfiler source files
* Rename profiler.cpp to main.cpp
* Modularize ckProfiler operations
* Add description for profiler operations
* Use longer name to avoid name collision
* Use macro to delay expansion
* Use std::move() to avoid object copying
* Prohibit users from calling dtor
* Use macro to eliminate redundant code
* Make friend function hidden
* Add missing include directive <iostream>
* Fix wrong include directives
* Remove int8 from batchnorm-forward instances since it is not needed for forward training and could fail test

Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com>

Modularize ckProfiler operations (#514)
* Re-structure ckProfiler source files
* Rename profiler.cpp to main.cpp
* Modularize ckProfiler operations
* Add description for profiler operations
* Use longer name to avoid name collision
* Use macro to delay expansion
* Use std::move() to avoid object copying
* Prohibit users from calling dtor
* Use macro to eliminate redundant code
* Make friend function hidden
* Add missing include directive <iostream>
* Fix wrong include directives
* Remove int8 from batchnorm-forward instances since it is not needed for forward training and could fail test

Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
8784a72e · Po Yen Chen · GitHub · ad541ad6 · 8784a72e · 8784a72e
Unverified Commit 8784a72e authored Dec 02, 2022 by Po Yen Chen Committed by GitHub Dec 01, 2022
20 changed files
--- a/profiler/include/profile_gemm_bilinear_impl.hpp
+++ b/profiler/include/profile_gemm_bilinear_impl.hpp
--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
--- a/profiler/include/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profile_gemm_splitk_impl.hpp
--- a/profiler/include/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profile_grouped_conv_bwd_weight_impl.hpp
--- a/profiler/include/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profile_grouped_conv_fwd_impl.hpp
--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
--- a/profiler/include/profile_groupnorm_impl.hpp
+++ b/profiler/include/profile_groupnorm_impl.hpp
--- a/profiler/include/profile_layernorm_impl.hpp
+++ b/profiler/include/profile_layernorm_impl.hpp
--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
--- a/profiler/include/profile_softmax_impl.hpp
+++ b/profiler/include/profile_softmax_impl.hpp
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
+# ckProfiler
+set(PROFILER_SOURCES
+    profiler.cpp
+    profile_gemm.cpp
+    profile_gemm_splitk.cpp
+    profile_gemm_bilinear.cpp
+    profile_gemm_bias_add_reduce.cpp
+    profile_gemm_add_add_fastgelu.cpp
+    profile_gemm_reduce.cpp
+    profile_batched_gemm.cpp
+    profile_batched_gemm_gemm.cpp
+    profile_batched_gemm_add_relu_gemm_add.cpp
+    profile_batched_gemm_reduce.cpp
+    profile_grouped_gemm.cpp
+    profile_conv_fwd.cpp
+    profile_conv_fwd_bias_relu.cpp
+    profile_conv_fwd_bias_relu_add.cpp
+    profile_conv_bwd_data.cpp
+    profile_grouped_conv_fwd.cpp
+    profile_grouped_conv_bwd_weight.cpp
+    profile_reduce.cpp
+    profile_groupnorm.cpp
+    profile_layernorm.cpp
+    profile_softmax.cpp
+    profile_batchnorm_fwd.cpp
+    profile_batchnorm_bwd.cpp
+)
+
+set(PROFILER_EXECUTABLE ckProfiler)
+
+add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
+target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
+
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
+
+rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -7,7 +7,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"

 enum struct GemmMatrixLayout
 {
@@ -25,12 +26,15 @@ enum struct GemmDataType
    INT8_INT8_INT8, // 3
 };

+#define OP_NAME "batched_gemm"
+#define OP_DESC "Batched GEMM"
+
 int profile_batched_gemm(int argc, char* argv[])
 {
    if(argc != 18)
    {
        // clang-format off
-        printf("arg1: tensor operation (batched_gemm: Batched GEMM)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n");
        printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
        printf("                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
@@ -195,3 +199,5 @@ int profile_batched_gemm(int argc, char* argv[])
        return 1;
    }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm);
--- a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
+++ b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
+#include "profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
+#include "profiler_operation_registry.hpp"

 using F16 = ck::half_t;
 using F32 = float;
@@ -14,6 +15,9 @@ using F32 = float;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

+#define OP_NAME "batched_gemm_add_relu_gemm_add"
+#define OP_DESC "Batched GEMM+Add+Relu+GEMM+Add"
+
 int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
 {
    enum struct GemmMatrixLayout
@@ -109,8 +113,7 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
    }
    else
    {
-        printf("arg1: tensor operation (batched_gemm_add_relu_gemm_add: "
-               "Batched_GEMM+Add+Relu+Gemm+Add)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (1: fp16)\n");
        printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
               "= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = "
@@ -207,3 +210,5 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_add_relu_gemm_add);
--- a/profiler/src/profile_batched_gemm_gemm.cpp
+++ b/profiler/src/profile_batched_gemm_gemm.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"

 using F16 = ck::half_t;
 using F32 = float;
@@ -14,6 +15,9 @@ using F32 = float;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

+#define OP_NAME "batched_gemm_gemm"
+#define OP_DESC "Batched GEMM+GEMM"
+
 int profile_batched_gemm_gemm(int argc, char* argv[])
 {
    enum struct GemmMatrixLayout
@@ -101,7 +105,7 @@ int profile_batched_gemm_gemm(int argc, char* argv[])
    }
    else
    {
-        printf("arg1: tensor operation (batched_gemm_gemm: Batched_GEMM+Gemm)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (1: fp16)\n");
        printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
               "= E1[m, o];  1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = E1[m, "
@@ -179,3 +183,5 @@ int profile_batched_gemm_gemm(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_gemm);
--- a/profiler/src/profile_batched_gemm_reduce.cpp
+++ b/profiler/src/profile_batched_gemm_reduce.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
+#include "profiler/profile_batched_gemm_reduce_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "batched_gemm_reduce"
+#define OP_DESC "Batched GEMM+Reduce"

 int profile_batched_gemm_reduce(int argc, char* argv[])
 {
@@ -26,7 +30,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])

    if(argc != 15)
    {
-        printf("arg1: tensor operation (batched_gemm_reduce: BatchedGEMM+Reduce)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -151,3 +155,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_reduce);
--- a/profiler/src/profile_batchnorm_bwd.cpp
+++ b/profiler/src/profile_batchnorm_bwd.cpp
@@ -6,7 +6,8 @@
 #include <getopt.h>

 #include "ck/library/utility/host_common_util.hpp"
-#include "profiler/include/profile_batchnorm_backward_impl.hpp"
+#include "profiler/profile_batchnorm_backward_impl.hpp"
+#include "profiler_operation_registry.hpp"

 using ck::index_t;

@@ -202,3 +203,5 @@ int profile_batchnorm_backward(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION("bnorm_bwd", "Batchnorm backward", profile_batchnorm_backward);
--- a/profiler/src/profile_batchnorm_fwd.cpp
+++ b/profiler/src/profile_batchnorm_fwd.cpp
@@ -6,7 +6,8 @@
 #include <getopt.h>

 #include "ck/library/utility/host_common_util.hpp"
-#include "profiler/include/profile_batchnorm_forward_impl.hpp"
+#include "profiler/profile_batchnorm_forward_impl.hpp"
+#include "profiler_operation_registry.hpp"

 using ck::index_t;

@@ -214,3 +215,5 @@ int profile_batchnorm_forward(int argc, char* argv[])

    return 0;
 }
+
+REGISTER_PROFILER_OPERATION("bnorm_fwd", "Batchnorm forward", profile_batchnorm_forward);
--- a/profiler/src/profile_conv_bwd_data.cpp
+++ b/profiler/src/profile_conv_bwd_data.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_conv_bwd_data_impl.hpp"
+#include "profiler/profile_conv_bwd_data_impl.hpp"
+#include "profiler_operation_registry.hpp"

 namespace {

@@ -24,10 +25,13 @@ enum struct ConvDataType
    INT8_INT8_INT8, // 3
 };

+#define OP_NAME "conv_bwd_data"
+#define OP_DESC "Convolution Backward Data"
+
 static void print_helper_msg()
 {
    std::cout
-        << "arg1: tensor operation (conv_bwd_data: Convolution Backward Data)\n"
+        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
        << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
        << "                 1: Input fp16, Weight fp16, Output fp16\n"
        << "                 2: Input bf16, Weight bf16, Output bf16\n"
@@ -182,3 +186,5 @@ int profile_conv_bwd_data(int argc, char* argv[])

    return 1;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_bwd_data);
--- a/profiler/src/profile_conv_fwd.cpp
+++ b/profiler/src/profile_conv_fwd.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>

-#include "profiler/include/profile_conv_fwd_impl.hpp"
+#include "profiler/profile_conv_fwd_impl.hpp"
+#include "profiler_operation_registry.hpp"

 namespace {

@@ -24,11 +25,14 @@ enum struct ConvDataType
    INT8_INT8_INT8, // 3
 };

+#define OP_NAME "conv_fwd"
+#define OP_DESC "Convolution Forward"
+
 static void print_helper_msg()
 {
    std::cout
        // clang-format-off
-        << "arg1: tensor operation (conv_fwd: Convolution Forward)\n"
+        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
        << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
        << "                 1: Input fp16, Weight fp16, Output fp16\n"
        << "                 2: Input bf16, Weight bf16, Output bf16\n"
@@ -184,3 +188,5 @@ int profile_conv_fwd(int argc, char* argv[])

    return 1;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd);