Commit b58b98ff authored by Chao Liu

add ckProfiler

parent 3d005816
@@ -136,7 +136,11 @@ struct TensorAdaptor
using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;
public:
#if 0 // workaround compiler complaint about constexpr
__host__ __device__ constexpr TensorAdaptor() = default;
#else
__host__ __device__ constexpr TensorAdaptor() : transforms_{}, element_size_{} {}
#endif
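// The defaulted constructor is kept under "#if 0" in case the compiler issue is
// resolved later; explicitly value-initializing transforms_ and element_size_
// yields an equivalent constexpr default constructor that the current compiler
// accepts.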
__host__ __device__ constexpr TensorAdaptor(const Transforms& transforms)
: transforms_{transforms}, element_size_{InitializeElementSize(transforms)}
......
@@ -111,7 +111,14 @@ struct TensorDescriptor
using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;
public:
#if 0 // workaround compiler complaint about constexpr
__host__ __device__ constexpr TensorDescriptor() = default;
#else
__host__ __device__ constexpr TensorDescriptor()
: transforms_{}, element_size_{}, element_space_size_{}
{
}
#endif
__host__ __device__ constexpr TensorDescriptor(const Transforms& transforms,
ElementSpaceSize element_space_size)
......
@@ -18,7 +18,7 @@ struct TupleElementKey
template <typename Key, typename Data>
struct TupleElementKeyData
{
#if 0
#if 0 // workaround compiler complaint about implicitly-deleted default constructor
__host__ __device__ constexpr TupleElementKeyData() = default;
#else
__host__ __device__ constexpr TupleElementKeyData() : mData{} {}
......
#pragma once
#include <stdlib.h>
#include <vector>
namespace ck {
namespace tensor_operation {
......
@@ -12,9 +12,9 @@ namespace device_gemm_instance {
using F16 = ck::half_t;
using F32 = float;
using F16_F16 = ck::Tuple<F16, F16>
using F16_F16 = ck::Tuple<F16, F16>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
@@ -28,7 +28,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// e = elementwise((a * b), d0, d1)
// output: e[m, n]
// input: a[k, m], b[k, n], d0[m, n], d1[m, n]
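// For reference, a naive sketch of what these instances compute (hypothetical
// helper in plain C++, not part of this commit; CK's FastGeLU may use a
// different approximation than the tanh form below):
//
//     #include <cmath>
//
//     inline float fast_gelu_ref(float x)
//     {
//         return 0.5f * x * (1.f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
//     }
//
//     // e[m, n] = FastGeLU(sum_k a[k, m] * b[k, n] + d0[m, n] + d1[m, n])
//     void ref_gemm_add_add_fastgelu(const float* a, const float* b, const float* d0,
//                                    const float* d1, float* e, int M, int N, int K)
//     {
//         for(int m = 0; m < M; ++m)
//             for(int n = 0; n < N; ++n)
//             {
//                 float acc = 0.f;
//                 for(int k = 0; k < K; ++k)
//                     acc += a[k * M + m] * b[k * N + n]; // a stored [K, M], b stored [K, N]
//                 e[m * N + n] = fast_gelu_ref(acc + d0[m * N + n] + d1[m * N + n]);
//             }
//     }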
using device_gemm_add_add_gelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
// clang-format off
//##############################| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//##############################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
......
#include <stdlib.h>
#include "config.hpp"
#include "device_gemm_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
#include "device_gemm_multiple_d_xdl_cshuffle.hpp"
namespace ck {
namespace tensor_operation {
@@ -12,9 +12,9 @@ namespace device_gemm_instance {
using F16 = ck::half_t;
using F32 = float;
using F16_F16 = ck::Tuple<F16, F16>
using F16_F16 = ck::Tuple<F16, F16>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
......
@@ -12,9 +12,9 @@ namespace device_gemm_instance {
using F16 = ck::half_t;
using F32 = float;
using F16_F16 = ck::Tuple<F16, F16>
using F16_F16 = ck::Tuple<F16, F16>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
......
@@ -24,7 +24,7 @@ include_directories(BEFORE
# ck_profiler
set(PROFILER_SOURCE
src/profiler.cpp
src/profile_gemm.cpp
# src/profile_gemm.cpp
# src/profile_gemm_bias_2d.cpp
# src/profile_gemm_bias_relu.cpp
# src/profile_gemm_bias_relu_add.cpp
@@ -47,7 +47,7 @@ add_executable(ckProfiler ${PROFILER_SOURCE})
target_link_libraries(ckProfiler PRIVATE host_tensor)
target_link_libraries(ckProfiler PRIVATE conv_util)
#target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
#target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
#target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
#target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance)
#target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance)
......
@@ -11,8 +11,8 @@
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
#include "device_gemm.hpp"
#include "reference_gemm.hpp"
#include "device_gemm_multiple_d.hpp"
namespace ck {
namespace tensor_operation {
@@ -23,7 +23,7 @@ using DeviceGemmAddAddFastGeluPtr = ck::tensor_operation::device::DeviceGemmMult
2,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::FastGelu>;
ck::tensor_operation::element_wise::AddAddFastGelu>;
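// Reading of the alias above (annotation, assumptions flagged): "2" is the number
// of auxiliary D tensors, the two PassThrough ops leave A and B unmodified, and
// AddAddFastGelu is the fused epilogue applied to the accumulated GEMM result.
// A minimal sketch of what such a functor plausibly looks like (inferred from the
// name, not copied from element_wise_operation.hpp):
//
//     struct AddAddFastGeluSketch
//     {
//         template <typename E, typename C, typename D0, typename D1>
//         __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const
//         {
//             const float x = static_cast<float>(c) + static_cast<float>(d0) + static_cast<float>(d1);
//             e = static_cast<E>(fast_gelu_ref(x)); // fast_gelu_ref: any FastGeLU approximation
//         }
//     };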
void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
std::vector<DeviceGemmAddAddFastGeluPtr>&);
@@ -44,6 +44,7 @@ namespace profiler {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename D0DataType,
typename D1DataType,
typename EDataType,
@@ -54,7 +55,7 @@ template <typename ADataType,
typename ELayout>
int profile_gemm_add_add_fastgelu_impl(int do_verification,
int init_method,
bool do_log,
bool /*do_log*/,
bool time_kernel,
int M,
int N,
@@ -131,28 +132,32 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification,
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_gelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(device_op_ptrs);
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
device_op_ptrs);
}
else if constexpr(is_same_v<ALayout, tensor_layout::gemm::RowMajor> &&
is_same_v<BLayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_gelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(device_op_ptrs);
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
device_op_ptrs);
}
else if constexpr(is_same_v<ALayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<BLayout, tensor_layout::gemm::RowMajor> &&
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_gelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(device_op_ptrs);
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
device_op_ptrs);
}
else if constexpr(is_same_v<ALayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<BLayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_gelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(device_op_ptrs);
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
device_op_ptrs);
}
}
......
@@ -23,7 +23,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
enum struct MatrixDataType
{
F32_F32_F32_F32_F32, // 0
F16_F16_F16_F16_F16_F16_F16, // 1
F16_F16_F16_F16_F16, // 1
BF16_BF16_BF16_BF16_BF16, // 2
INT8_INT8_INT8_INT8_INT8, // 3
};
@@ -31,7 +31,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
if(argc != 16)
{
// clang-format off
printf("arg1: tensor operation (gemm_gelu: GEMM+Add+Add+GeLU)\n");
printf("arg1: tensor operation (gemm_add_add_fastgelu: GEMM+Add+Add+GeLU)\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n] + D1[m, n]);\n");
printf(" 1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n] + D1[m, n]);\n");
@@ -40,7 +40,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
// clang-format on
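// A hypothetical invocation matching the usage text above (values illustrative):
//   ckProfiler gemm_add_add_fastgelu 1 0 1 1 0 1 3840 4096 4096 4096 4096 4096 4096 4096
// i.e. fp16 data, layout 0 (all row-major), verification on, integer init, no
// tensor print, time the kernel, then M N K StrideA StrideB StrideD0 StrideD1 StrideE.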
exit(1);
@@ -64,12 +64,14 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
const int StrideE = std::stoi(argv[15]);
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto acc_type,
auto d0_type,
auto d1_type,
auto e_type,
@@ -80,6 +82,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
auto e_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type);
using D0DataType = decltype(d0_type);
using D1DataType = decltype(d1_type);
using EDataType = decltype(e_type);
@@ -96,8 +99,9 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
const int DefaultStrideD1 = ck::is_same_v<D1Layout, Row> ? N : M;
const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
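// Annotation: the default stride is the leading dimension of a packed matrix,
// i.e. the row length N for a row-major [M, N] tensor and M for column-major.
// For example (hypothetical values), with M = 3, N = 4 and a row-major E,
// element (m, n) lives at offset m * StrideE + n with StrideE = 4.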
return ck::profiler::profile_gemm_add_add_gelu_impl<ADataType,
return ck::profiler::profile_gemm_add_add_fastgelu_impl<ADataType,
BDataType,
AccDataType,
D0DataType,
D1DataType,
EDataType,
@@ -122,22 +126,22 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN_MN)
{
return profile(F16{}, F16{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::MK_NK_MN_MN_MN)
{
return profile(F16{}, F16{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::KM_KN_MN_MN_MN)
{
return profile(F16{}, F16{}, F16{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{}, Row{});
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::KM_NK_MN_MN_MN)
{
return profile(F16{}, F16{}, F16{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{}, Row{});
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{}, Row{});
}
else
{
......
@@ -54,11 +54,11 @@ int main(int argc, char* argv[])
return 0;
}
#if 0
if(strcmp(argv[1], "gemm") == 0)
{
return profile_gemm(argc, argv);
}
#if 0
else if(strcmp(argv[1], "gemm_bias_2d") == 0)
{
return profile_gemm_bias_2d(argc, argv);
@@ -124,7 +124,7 @@ int main(int argc, char* argv[])
return profile_conv_bwd_weight(argc, argv);
}
#endif
else if(strcmp(argv[1], "gemm_add_add_fastgelu") == 0)
if(strcmp(argv[1], "gemm_add_add_fastgelu") == 0)
{
return profile_gemm_add_add_fastgelu(argc, argv);
}
......