Commit ca313a29 authored by letaoqin
Browse files

Merge branch 'develop' into dl_conv_multiple_d

parents d47bf127 8784a72e
# ckProfiler
#
# Source files compiled into the profiler executable. The list is kept
# explicit (no globbing) and is grouped by operation family for readability;
# the order of entries is unchanged.
set(PROFILER_SOURCES
    # Entry point
    profiler.cpp

    # GEMM variants
    profile_gemm.cpp
    profile_gemm_splitk.cpp
    profile_gemm_bilinear.cpp
    profile_gemm_bias_add_reduce.cpp
    profile_gemm_add_add_fastgelu.cpp
    profile_gemm_reduce.cpp

    # Batched GEMM variants
    profile_batched_gemm.cpp
    profile_batched_gemm_gemm.cpp
    profile_batched_gemm_add_relu_gemm_add.cpp
    profile_batched_gemm_reduce.cpp

    # Grouped GEMM
    profile_grouped_gemm.cpp

    # Convolution (forward, backward-data, grouped forward / backward-weight)
    profile_conv_fwd.cpp
    profile_conv_fwd_bias_relu.cpp
    profile_conv_fwd_bias_relu_add.cpp
    profile_conv_bwd_data.cpp
    profile_grouped_conv_fwd.cpp
    profile_grouped_conv_bwd_weight.cpp

    # Reduction, normalization and softmax
    profile_reduce.cpp
    profile_groupnorm.cpp
    profile_layernorm.cpp
    profile_softmax.cpp
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
)
# Define the profiler executable, configure its warnings, link it against the
# utility library plus every device-operation instance library, and register
# it for installation under the "profiler" component.
set(PROFILER_EXECUTABLE ckProfiler)

add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})

# Suppress the global-constructors warning for this target only.
target_compile_options(${PROFILER_EXECUTABLE}
    PRIVATE
        -Wno-global-constructors
)

# A single call with one PRIVATE keyword; the libraries are listed in the same
# order as before so the resulting link line is unchanged.
target_link_libraries(${PROFILER_EXECUTABLE}
    PRIVATE
        utility

        # GEMM
        device_gemm_instance
        device_gemm_splitk_instance
        device_gemm_bilinear_instance
        device_gemm_add_add_fastgelu_instance
        device_gemm_reduce_instance
        device_gemm_bias_add_reduce_instance

        # Batched / grouped GEMM
        device_batched_gemm_instance
        device_batched_gemm_gemm_instance
        device_batched_gemm_add_relu_gemm_add_instance
        device_batched_gemm_reduce_instance
        device_grouped_gemm_instance

        # Convolution forward
        device_conv2d_fwd_instance
        device_grouped_conv1d_fwd_instance
        device_grouped_conv2d_fwd_instance
        device_grouped_conv3d_fwd_instance

        # Convolution backward-data
        device_conv1d_bwd_data_instance
        device_conv2d_bwd_data_instance
        device_conv3d_bwd_data_instance

        # Grouped convolution backward-weight
        device_grouped_conv1d_bwd_weight_instance
        device_grouped_conv2d_bwd_weight_instance
        device_grouped_conv3d_bwd_weight_instance

        # Fused convolution forward
        device_conv2d_fwd_bias_relu_instance
        device_conv2d_fwd_bias_relu_add_instance

        # Normalization, softmax, reduction, batchnorm
        device_normalization_instance
        device_softmax_instance
        device_reduce_instance
        device_batchnorm_instance
)

rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
......@@ -7,7 +7,8 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_batched_gemm_impl.hpp"
#include "profiler/profile_batched_gemm_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct GemmMatrixLayout
{
......@@ -25,12 +26,15 @@ enum struct GemmDataType
INT8_INT8_INT8, // 3
};
#define OP_NAME "batched_gemm"
#define OP_DESC "Batched GEMM"
int profile_batched_gemm(int argc, char* argv[])
{
if(argc != 18)
{
// clang-format off
printf("arg1: tensor operation (batched_gemm: Batched GEMM)\n");
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n");
printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
printf(" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
......@@ -195,3 +199,5 @@ int profile_batched_gemm(int argc, char* argv[])
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm);
......@@ -6,7 +6,8 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
#include "profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
#include "profiler_operation_registry.hpp"
using F16 = ck::half_t;
using F32 = float;
......@@ -14,6 +15,9 @@ using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
#define OP_NAME "batched_gemm_add_relu_gemm_add"
#define OP_DESC "Batched GEMM+Add+Relu+GEMM+Add"
int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
{
enum struct GemmMatrixLayout
......@@ -109,8 +113,7 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
}
else
{
printf("arg1: tensor operation (batched_gemm_add_relu_gemm_add: "
"Batched_GEMM+Add+Relu+Gemm+Add)\n");
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (1: fp16)\n");
printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
"= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = "
......@@ -207,3 +210,5 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
return 0;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_add_relu_gemm_add);
......@@ -6,7 +6,8 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
#include "profiler/profile_batched_gemm_gemm_impl.hpp"
#include "profiler_operation_registry.hpp"
using F16 = ck::half_t;
using F32 = float;
......@@ -14,6 +15,9 @@ using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
#define OP_NAME "batched_gemm_gemm"
#define OP_DESC "Batched GEMM+GEMM"
int profile_batched_gemm_gemm(int argc, char* argv[])
{
enum struct GemmMatrixLayout
......@@ -101,7 +105,7 @@ int profile_batched_gemm_gemm(int argc, char* argv[])
}
else
{
printf("arg1: tensor operation (batched_gemm_gemm: Batched_GEMM+Gemm)\n");
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (1: fp16)\n");
printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
"= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = E1[m, "
......@@ -179,3 +183,5 @@ int profile_batched_gemm_gemm(int argc, char* argv[])
return 0;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_gemm);
......@@ -6,7 +6,11 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
#include "profiler/profile_batched_gemm_reduce_impl.hpp"
#include "profiler_operation_registry.hpp"
#define OP_NAME "batched_gemm_reduce"
#define OP_DESC "Batched GEMM+Reduce"
int profile_batched_gemm_reduce(int argc, char* argv[])
{
......@@ -26,7 +30,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
if(argc != 15)
{
printf("arg1: tensor operation (batched_gemm_reduce: BatchedGEMM+Reduce)\n");
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
......@@ -151,3 +155,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
return 0;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_reduce);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment