Commit 0c823497 authored by muozturk's avatar muozturk
Browse files

merge

parents 334cfe1c 68f2b5e7
...@@ -93,6 +93,26 @@ static auto make_ref_op() ...@@ -93,6 +93,26 @@ static auto make_ref_op()
} }
} }
template <typename InputLayout>
static auto create_gemm_desc(const ck::index_t G, const ck::index_t NDoHoWo, const ck::index_t CZYX)
{
using namespace ck::tensor_layout::convolution;
if constexpr(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
std::is_same_v<InputLayout, GNDHWC>)
{
return HostTensorDescriptor({G, NDoHoWo, CZYX});
}
else if constexpr(std::is_same_v<InputLayout, NWGC> || std::is_same_v<InputLayout, NHWGC> ||
std::is_same_v<InputLayout, NDHWGC>)
{
return HostTensorDescriptor({G, NDoHoWo, CZYX}, {CZYX, CZYX * G, 1});
}
else
{
throw std::runtime_error("Unsupported layout!");
}
}
template <index_t NDimSpatial, template <index_t NDimSpatial,
typename InputLayout, typename InputLayout,
typename InputDataType, typename InputDataType,
...@@ -116,13 +136,13 @@ bool profile_conv_tensor_rearrange_impl(int do_verification, ...@@ -116,13 +136,13 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
const auto image_desc = const auto image_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InputLayout>( ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InputLayout>(
conv_param); conv_param);
const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX}); const auto gemm_desc = create_gemm_desc<InputLayout>(conv_param.G_, NDoHoWo, CZYX);
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{}; std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
std::array<ck::index_t, 2> gemm_m_k_strides{}; std::array<ck::index_t, 3> gemm_g_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}; std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{}; std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{}; std::array<ck::index_t, NDimSpatial> input_left_pads{};
...@@ -134,7 +154,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification, ...@@ -134,7 +154,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_param.output_spatial_lengths_, output_spatial_lengths); copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
copy(image_desc.GetStrides(), image_g_n_c_wis_strides); copy(image_desc.GetStrides(), image_g_n_c_wis_strides);
copy(gemm_desc.GetStrides(), gemm_m_k_strides); copy(gemm_desc.GetStrides(), gemm_g_m_k_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides); copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations); copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_left_pads_, input_left_pads);
...@@ -212,13 +232,14 @@ bool profile_conv_tensor_rearrange_impl(int do_verification, ...@@ -212,13 +232,14 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
auto argument_ptr = op_ptr->MakeArgumentPointer( auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<InputDataType*>(in_device_buf.GetDeviceBuffer()), static_cast<InputDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutputDataType*>(out_device_buf.GetDeviceBuffer()), static_cast<OutputDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param.G_,
conv_param.N_, conv_param.N_,
conv_param.C_, conv_param.C_,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -234,7 +255,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification, ...@@ -234,7 +255,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
float avg_time = float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_btype = std::size_t num_btype =
NDoHoWo * CZYX * (sizeof(OutputDataType) + sizeof(InputDataType)); conv_param.G_ * NDoHoWo * CZYX * (sizeof(OutputDataType) + sizeof(InputDataType));
float gb_per_sec = num_btype / 1.E6 / avg_time; float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl; << op_name << std::endl;
......
...@@ -80,6 +80,8 @@ bool profile_elementwise_layernorm_impl(int do_verification, ...@@ -80,6 +80,8 @@ bool profile_elementwise_layernorm_impl(int do_verification,
Tensor<BetaDataType> beta(gammaBetaLength); Tensor<BetaDataType> beta(gammaBetaLength);
Tensor<YDataType> y(length); Tensor<YDataType> y(length);
Tensor<YDataType> host_y(length); Tensor<YDataType> host_y(length);
Tensor<AccDataType> host_save_mean({M});
Tensor<AccDataType> host_save_inv_std({M});
switch(init_method) switch(init_method)
{ {
...@@ -152,14 +154,23 @@ bool profile_elementwise_layernorm_impl(int do_verification, ...@@ -152,14 +154,23 @@ bool profile_elementwise_layernorm_impl(int do_verification,
BetaDataType, BetaDataType,
YDataType, YDataType,
AccDataType, AccDataType,
AccDataType,
PassThrough, PassThrough,
Rank, Rank,
NumReduceDim>; NumReduceDim>;
ReferenceInstance ref; ReferenceInstance ref;
auto ref_argument = auto ref_argument = ref.MakeArgument(x,
ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4); gamma,
auto ref_invoker = ref.MakeInvoker(); beta,
host_y,
host_save_mean,
host_save_inv_std,
PassThrough{},
{M, N},
{1},
1e-4);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
......
...@@ -66,12 +66,15 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n, ...@@ -66,12 +66,15 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
BetaDataType, BetaDataType,
HDataType, HDataType,
AccDataType, AccDataType,
AccDataType,
HElementOp, HElementOp,
2, 2,
1>; 1>;
Tensor<EMeanVarDataType> e_m_n(HostTensorDescriptor{M, N}); Tensor<EMeanVarDataType> e_m_n(HostTensorDescriptor{M, N});
Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N}); Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
Tensor<AccDataType> save_mean({M});
Tensor<AccDataType> save_inv_std({M});
auto ref_gemm = ReferenceGemm{}; auto ref_gemm = ReferenceGemm{};
auto ref_gemm_invoker = ref_gemm.MakeInvoker(); auto ref_gemm_invoker = ref_gemm.MakeInvoker();
...@@ -97,7 +100,7 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n, ...@@ -97,7 +100,7 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
auto ref_layernorm_invoker = ref_layernorm.MakeInvoker(); auto ref_layernorm_invoker = ref_layernorm.MakeInvoker();
auto ref_layernorm_argument = ref_layernorm.MakeArgument( auto ref_layernorm_argument = ref_layernorm.MakeArgument(
e_m_n, gamma_n, beta_n, h_m_n, h_element_op, {M, N}, {1}, epsilon); e_m_n, gamma_n, beta_n, h_m_n, save_mean, save_inv_std, h_element_op, {M, N}, {1}, epsilon);
ref_layernorm_invoker.Run(ref_layernorm_argument); ref_layernorm_invoker.Run(ref_layernorm_argument);
} }
......
...@@ -75,8 +75,8 @@ int profile_gemm_impl(int do_verification, ...@@ -75,8 +75,8 @@ int profile_gemm_impl(int do_verification,
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}); b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break; break;
default: default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 0.1});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}); b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.05, 0.05});
} }
using AElementOp = ck::tensor_operation::element_wise::PassThrough; using AElementOp = ck::tensor_operation::element_wise::PassThrough;
......
...@@ -30,7 +30,8 @@ template <typename ADataType, ...@@ -30,7 +30,8 @@ template <typename ADataType,
typename CDataType, typename CDataType,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
typename CLayout> typename CLayout,
typename ComputeType = CDataType>
bool profile_gemm_splitk_impl(int do_verification, bool profile_gemm_splitk_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
...@@ -103,7 +104,8 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -103,7 +104,8 @@ bool profile_gemm_splitk_impl(int do_verification,
CDataType, CDataType,
AElementOp, AElementOp,
BElementOp, BElementOp,
CElementOp>; CElementOp,
ComputeType>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
...@@ -120,7 +122,8 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -120,7 +122,8 @@ bool profile_gemm_splitk_impl(int do_verification,
AccDataType, AccDataType,
AElementOp, AElementOp,
BElementOp, BElementOp,
CElementOp>; CElementOp,
ComputeType>;
auto ref_gemm = ReferenceGemmInstance{}; auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker(); auto ref_invoker = ref_gemm.MakeInvoker();
...@@ -140,8 +143,7 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -140,8 +143,7 @@ bool profile_gemm_splitk_impl(int do_verification,
// profile device GEMM instances // profile device GEMM instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 36, 40, 60, std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 32, 36, 40, 64, 96, 128};
64, 72, 80, 88, 96, 128, 144, 160, 176, 192, 256};
if(KBatch > 0) if(KBatch > 0)
{ {
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" #include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -21,8 +21,10 @@ namespace profiler { ...@@ -21,8 +21,10 @@ namespace profiler {
template <typename XDataType, template <typename XDataType,
typename GammaDataType, typename GammaDataType,
typename BetaDataType, typename BetaDataType,
typename AccDataType, typename ComputeDataType,
typename YDataType> typename YDataType,
typename SaveMeanInvStdDataType,
bool SaveMeanInvStd>
bool profile_groupnorm_impl(int do_verification, bool profile_groupnorm_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
...@@ -34,6 +36,7 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -34,6 +36,7 @@ bool profile_groupnorm_impl(int do_verification,
if(length.size() != 5) if(length.size() != 5)
return false; return false;
index_t N = length[0];
index_t G = length[3]; index_t G = length[3];
index_t C = length[4]; index_t C = length[4];
...@@ -45,7 +48,14 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -45,7 +48,14 @@ bool profile_groupnorm_impl(int do_verification,
Tensor<GammaDataType> gamma(gammaBetaLength); Tensor<GammaDataType> gamma(gammaBetaLength);
Tensor<BetaDataType> beta(gammaBetaLength); Tensor<BetaDataType> beta(gammaBetaLength);
Tensor<YDataType> y(length); Tensor<YDataType> y(length);
Tensor<SaveMeanInvStdDataType> save_mean({N, G});
Tensor<SaveMeanInvStdDataType> save_inv_std({N, G});
Tensor<YDataType> host_y(length); Tensor<YDataType> host_y(length);
Tensor<SaveMeanInvStdDataType> host_save_mean({N, G});
Tensor<SaveMeanInvStdDataType> host_save_inv_std({N, G});
std::vector<index_t> strideSaveMeanInvStd = {1};
switch(init_method) switch(init_method)
{ {
...@@ -69,20 +79,23 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -69,20 +79,23 @@ bool profile_groupnorm_impl(int do_verification,
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem save_mean_dev(sizeof(SaveMeanInvStdDataType) * save_mean.mDesc.GetElementSpaceSize());
DeviceMem save_inv_std_dev(sizeof(SaveMeanInvStdDataType) *
save_inv_std.mDesc.GetElementSpaceSize());
x_dev.ToDevice(x.mData.data()); x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data()); gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data()); beta_dev.ToDevice(beta.mData.data());
// add device normalization instances // add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType, using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
GammaDataType, GammaDataType,
BetaDataType, BetaDataType,
AccDataType, YDataType,
YDataType, SaveMeanInvStdDataType,
PassThrough, PassThrough,
5, 5,
3>; 3>;
// get device op instances // get device op instances
const auto instance_ptrs = const auto instance_ptrs =
...@@ -97,38 +110,70 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -97,38 +110,70 @@ bool profile_groupnorm_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm<XDataType, using ReferenceInstance =
GammaDataType, ck::tensor_operation::host::ReferenceGroupnorm<XDataType,
BetaDataType, GammaDataType,
YDataType, BetaDataType,
AccDataType, YDataType,
PassThrough>; SaveMeanInvStdDataType,
ComputeDataType,
PassThrough>;
ReferenceInstance ref; ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, length, 1e-6); auto ref_argument = ref.MakeArgument(
auto ref_invoker = ref.MakeInvoker(); x, gamma, beta, host_y, host_save_mean, host_save_inv_std, PassThrough{}, length, 1e-6);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
int num_kernel = 0; int num_kernel = 0;
auto f_get_argument = [&](auto& inst_ptr) {
if constexpr(SaveMeanInvStd)
return inst_ptr->MakeArgumentPointer(
length,
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
gammaBetaStride,
gammaBetaStride,
std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
save_mean.mDesc.GetStrides().end()},
std::vector<ck::index_t>{save_inv_std.mDesc.GetStrides().begin(),
save_inv_std.mDesc.GetStrides().end()},
reduce_dim,
1e-6,
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
save_mean_dev.GetDeviceBuffer(),
save_inv_std_dev.GetDeviceBuffer(),
PassThrough{});
else
return inst_ptr->MakeArgumentPointer(
length,
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
gammaBetaStride,
gammaBetaStride,
std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
save_mean.mDesc.GetStrides().end()},
std::vector<ck::index_t>{save_inv_std.mDesc.GetStrides().begin(),
save_inv_std.mDesc.GetStrides().end()},
reduce_dim,
1e-6,
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
nullptr,
nullptr,
PassThrough{});
};
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer( auto argument_ptr = f_get_argument(inst_ptr);
length,
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
gammaBetaStride,
gammaBetaStride,
std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
reduce_dim,
1e-6,
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
nullptr,
nullptr,
PassThrough{});
if(inst_ptr->IsSupportedArgument(argument_ptr.get())) if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
...@@ -152,6 +197,10 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -152,6 +197,10 @@ bool profile_groupnorm_impl(int do_verification,
beta.mDesc.GetElementSize() * sizeof(BetaDataType) + beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
y.mDesc.GetElementSize() * sizeof(YDataType); y.mDesc.GetElementSize() * sizeof(YDataType);
if constexpr(SaveMeanInvStd)
num_bytes += save_mean.mDesc.GetElementSpaceSize() * sizeof(SaveMeanInvStdDataType) +
save_inv_std.mDesc.GetElementSpaceSize() * sizeof(SaveMeanInvStdDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel) if(time_kernel)
...@@ -168,9 +217,22 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -168,9 +217,22 @@ bool profile_groupnorm_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
y_dev.FromDevice(y.mData.data()); y_dev.FromDevice(y.mData.data());
bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3); bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
if constexpr(SaveMeanInvStd)
{
save_mean_dev.FromDevice(save_mean.mData.data());
pass &= ck::utils::check_err(
save_mean.mData, host_save_mean.mData, "Error: Incorrect results", 1e-3, 1e-3);
save_inv_std_dev.FromDevice(save_inv_std.mData.data());
pass &= ck::utils::check_err(save_inv_std.mData,
host_save_inv_std.mData,
"Error: Incorrect results",
1e-3,
1e-3);
}
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "x : ", x.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "x : ", x.mData, ",") << std::endl;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <iomanip> #include <iomanip>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" #include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
...@@ -21,6 +21,8 @@ template <typename XDataType, ...@@ -21,6 +21,8 @@ template <typename XDataType,
typename BetaDataType, typename BetaDataType,
typename ComputeDataType, typename ComputeDataType,
typename YDataType, typename YDataType,
typename SaveMeanInvStdDataType,
bool SaveMeanInvStd,
index_t Rank> index_t Rank>
bool profile_layernorm_impl(int do_verification, bool profile_layernorm_impl(int do_verification,
int init_method, int init_method,
...@@ -43,13 +45,19 @@ bool profile_layernorm_impl(int do_verification, ...@@ -43,13 +45,19 @@ bool profile_layernorm_impl(int do_verification,
Tensor<GammaDataType> gamma(reduce_length); Tensor<GammaDataType> gamma(reduce_length);
Tensor<BetaDataType> beta(reduce_length); Tensor<BetaDataType> beta(reduce_length);
Tensor<YDataType> y(length); Tensor<YDataType> y(length);
Tensor<SaveMeanInvStdDataType> save_mean({length[0]});
Tensor<SaveMeanInvStdDataType> save_inv_std({length[0]});
Tensor<YDataType> host_y(length); Tensor<YDataType> host_y(length);
Tensor<SaveMeanInvStdDataType> host_save_mean({length[0]});
Tensor<SaveMeanInvStdDataType> host_save_inv_std({length[0]});
std::vector<index_t> strideXY = std::vector<index_t> strideXY =
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}; std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
std::vector<index_t> strideGammaBeta = strideXY; std::vector<index_t> strideGammaBeta = strideXY;
strideGammaBeta[0] = 0; strideGammaBeta[0] = 0;
std::vector<index_t> strideSaveMeanInvStd = {1};
switch(init_method) switch(init_method)
{ {
case 0: case 0:
...@@ -75,6 +83,9 @@ bool profile_layernorm_impl(int do_verification, ...@@ -75,6 +83,9 @@ bool profile_layernorm_impl(int do_verification,
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem save_mean_dev(sizeof(SaveMeanInvStdDataType) * save_mean.mDesc.GetElementSpaceSize());
DeviceMem save_inv_std_dev(sizeof(SaveMeanInvStdDataType) *
save_inv_std.mDesc.GetElementSpaceSize());
x_dev.ToDevice(x.mData.data()); x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data()); gamma_dev.ToDevice(gamma.mData.data());
...@@ -83,14 +94,14 @@ bool profile_layernorm_impl(int do_verification, ...@@ -83,14 +94,14 @@ bool profile_layernorm_impl(int do_verification,
constexpr int NumReduceDim = Rank - 1; constexpr int NumReduceDim = Rank - 1;
// add device normalization instances // add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType, using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
GammaDataType, GammaDataType,
BetaDataType, BetaDataType,
ComputeDataType, YDataType,
YDataType, SaveMeanInvStdDataType,
PassThrough, PassThrough,
Rank, Rank,
NumReduceDim>; NumReduceDim>;
// get device op instances // get device op instances
const auto instance_ptrs = const auto instance_ptrs =
...@@ -105,40 +116,74 @@ bool profile_layernorm_impl(int do_verification, ...@@ -105,40 +116,74 @@ bool profile_layernorm_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm<XDataType, using ReferenceInstance =
GammaDataType, ck::tensor_operation::host::ReferenceLayernorm<XDataType,
BetaDataType, GammaDataType,
YDataType, BetaDataType,
ComputeDataType, YDataType,
PassThrough, SaveMeanInvStdDataType,
Rank, ComputeDataType,
NumReduceDim>; PassThrough,
Rank,
NumReduceDim>;
ReferenceInstance ref; ReferenceInstance ref;
auto ref_argument = auto ref_argument = ref.MakeArgument(x,
ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, length, reduce_dim, 1e-4); gamma,
auto ref_invoker = ref.MakeInvoker(); beta,
host_y,
host_save_mean,
host_save_inv_std,
PassThrough{},
length,
reduce_dim,
1e-4);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
int num_kernel = 0; int num_kernel = 0;
auto f_get_argument = [&](auto& inst_ptr) {
if constexpr(SaveMeanInvStd)
return inst_ptr->MakeArgumentPointer(length,
strideXY,
strideGammaBeta,
strideGammaBeta,
strideXY,
strideSaveMeanInvStd,
strideSaveMeanInvStd,
reduce_dim,
1e-4,
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
save_mean_dev.GetDeviceBuffer(),
save_inv_std_dev.GetDeviceBuffer(),
PassThrough{});
else
return inst_ptr->MakeArgumentPointer(length,
strideXY,
strideGammaBeta,
strideGammaBeta,
strideXY,
strideSaveMeanInvStd,
strideSaveMeanInvStd,
reduce_dim,
1e-4,
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
nullptr,
nullptr,
PassThrough{});
};
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer(length, auto argument_ptr = f_get_argument(inst_ptr);
strideXY,
strideGammaBeta,
strideGammaBeta,
strideXY,
reduce_dim,
1e-4,
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
nullptr,
nullptr,
PassThrough{});
if(inst_ptr->IsSupportedArgument(argument_ptr.get())) if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
...@@ -168,6 +213,10 @@ bool profile_layernorm_impl(int do_verification, ...@@ -168,6 +213,10 @@ bool profile_layernorm_impl(int do_verification,
beta.mDesc.GetElementSize() * sizeof(BetaDataType) + beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
y.mDesc.GetElementSize() * sizeof(YDataType); y.mDesc.GetElementSize() * sizeof(YDataType);
if constexpr(SaveMeanInvStd)
num_bytes += save_mean.mDesc.GetElementSpaceSize() * sizeof(SaveMeanInvStdDataType) +
save_inv_std.mDesc.GetElementSpaceSize() * sizeof(SaveMeanInvStdDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel) if(time_kernel)
...@@ -184,10 +233,23 @@ bool profile_layernorm_impl(int do_verification, ...@@ -184,10 +233,23 @@ bool profile_layernorm_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
y_dev.FromDevice(y.mData.data()); y_dev.FromDevice(y.mData.data());
bool pass = bool pass =
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
if constexpr(SaveMeanInvStd)
{
save_mean_dev.FromDevice(save_mean.mData.data());
pass &= ck::utils::check_err(
save_mean.mData, host_save_mean.mData, "Error: Incorrect results", 1e-3, 1e-3);
save_inv_std_dev.FromDevice(save_inv_std.mData.data());
pass &= ck::utils::check_err(save_inv_std.mData,
host_save_inv_std.mData,
"Error: Incorrect results",
1e-3,
1e-3);
}
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "x : ", x.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "x : ", x.mData, ",") << std::endl;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace profiler {
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
{
for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d)
for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h)
for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
{
auto a_val = A_ncdhw(n, c, d, h, w);
functor(B_nchwd(n, c, h, w, d), a_val);
}
}
template <typename ADataType, typename BDataType, index_t NumDim>
bool profile_transpose_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> lengths)
{
bool pass = true;
index_t N = lengths[0];
index_t C = lengths[1];
index_t D = lengths[2];
index_t H = lengths[3];
index_t W = lengths[4];
std::vector<ck::index_t> ncdhw = {N, C, D, H, W};
std::vector<ck::index_t> ndhwc = {N, D, H, W, C};
Tensor<ADataType> a(ncdhw);
Tensor<BDataType> b(ndhwc);
Tensor<BDataType> host_b(ndhwc);
// a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
std::cout << "A: " << a.mDesc << std::endl;
std::cout << "B: " << b.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
}
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
// const auto element_op = ElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
using DeviceOp = ck::tensor_operation::device::
DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, ElementOp, NumDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
if(do_verification)
{
host_elementwise4D(host_b, a, ElementOp{});
}
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init C to zero before profiling next kernel
b_device_buf.SetZero();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
if(do_verification)
{
b_device_buf.FromDevice(b.mData.data());
pass &= ck::utils::check_err(
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
}
}
std::string op_name = op_ptr->GetTypeString();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop =
std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
std::size_t num_btype =
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
// pass = pass & ck::utils::check_err(b_device_result, b_host_result);
pass &= ck::utils::check_err(
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
}
}
std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W
<< " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
<< " GB/s, " << best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
...@@ -16,8 +16,8 @@ set(PROFILER_SOURCES ...@@ -16,8 +16,8 @@ set(PROFILER_SOURCES
profile_grouped_conv_fwd.cpp profile_grouped_conv_fwd.cpp
profile_grouped_conv_bwd_weight.cpp profile_grouped_conv_bwd_weight.cpp
profile_reduce.cpp profile_reduce.cpp
profile_groupnorm.cpp profile_groupnorm_fwd.cpp
profile_layernorm.cpp profile_layernorm_fwd.cpp
profile_max_pool3d_fwd.cpp profile_max_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp profile_max_pool3d_bwd.cpp
...@@ -25,14 +25,14 @@ set(PROFILER_SOURCES ...@@ -25,14 +25,14 @@ set(PROFILER_SOURCES
profile_batchnorm_fwd.cpp profile_batchnorm_fwd.cpp
profile_batchnorm_bwd.cpp profile_batchnorm_bwd.cpp
profile_batchnorm_infer.cpp profile_batchnorm_infer.cpp
profile_contraction_bilinear.cpp
profile_contraction_scale.cpp
profile_grouped_conv_bwd_data.cpp profile_grouped_conv_bwd_data.cpp
profile_conv_tensor_rearrange.cpp profile_conv_tensor_rearrange.cpp
) )
if(DL_KERNELS) if(DL_KERNELS)
list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
endif() endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
...@@ -46,6 +46,11 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) ...@@ -46,6 +46,11 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
endif() endif()
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
endif()
set(PROFILER_EXECUTABLE ckProfiler) set(PROFILER_EXECUTABLE ckProfiler)
add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES}) add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
...@@ -72,12 +77,10 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_w ...@@ -72,12 +77,10 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_w
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
...@@ -85,9 +88,18 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_d ...@@ -85,9 +88,18 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_d
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
endif()
if(DL_KERNELS) if(DL_KERNELS)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
endif() endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
...@@ -100,4 +112,5 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) ...@@ -100,4 +112,5 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
endif() endif()
rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
...@@ -17,8 +17,9 @@ ...@@ -17,8 +17,9 @@
static void print_helper_msg() static void print_helper_msg()
{ {
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: fp32; 1: f64)\n" << "arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
<< "arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + " << "arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
<< "arg4: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
<< " 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + " << " 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
...@@ -26,40 +27,42 @@ static void print_helper_msg() ...@@ -26,40 +27,42 @@ static void print_helper_msg()
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
<< " 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + " << " 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
<< "arg4: verification (0: no; 1: yes)\n" << "arg5: verification (0: no; 1: yes)\n"
<< "arg5: initialization (0: no init; 1: integer value; 2: decimal " << "arg6: initialization (0: no init; 1: integer value; 2: decimal "
<< "value)\n" << "value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n" << "arg7: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n" << "arg8: time kernel (0: no, 1: yes)\n"
<< "arg8 and arg9: alpha and beta\n" << "arg9: alpha\n"
<< "arg10 to 15: M0, M1, N0, N1, K0, K1\n" << "arg10: beta\n"
<< "arg16 to 31: Strides for A, B, D and E (skip for default)\n" << "arg11 to 16: M0, M1, N0, N1, K0, K1\n"
<< "arg17 to 32: Strides for A, B, D and E (skip for default)\n"
<< std::endl; << std::endl;
} }
int profile_contraction_bilinear(int argc, char* argv[]) int profile_contraction_bilinear(int argc, char* argv[])
{ {
const bool default_strides = argc == 16; const bool default_strides = argc == 17;
if(argc != 32 && argc != 16) if(argc != 33 && argc != 17)
{ {
print_helper_msg(); print_helper_msg();
exit(1); exit(1);
} }
const auto data_type = static_cast<ContractionDataType>(std::stoi(argv[2])); const auto data_type = static_cast<ContractionDataType>(std::stoi(argv[2]));
const auto layout = static_cast<ContractionMatrixLayout>(std::stoi(argv[3])); const auto compute_data_type = static_cast<ContractionComputeDataType>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]); const auto layout = static_cast<ContractionMatrixLayout>(std::stoi(argv[4]));
const ck::index_t init_method = std::stoi(argv[5]); const bool do_verification = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const ck::index_t init_method = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]); const bool do_log = std::stoi(argv[7]);
const float alpha = std::stof(argv[8]); const bool time_kernel = std::stoi(argv[8]);
const float beta = std::stof(argv[9]); const float alpha = std::stof(argv[9]);
const float beta = std::stof(argv[10]);
std::vector<ck::index_t> M; std::vector<ck::index_t> M;
std::vector<ck::index_t> N; std::vector<ck::index_t> N;
std::vector<ck::index_t> K; std::vector<ck::index_t> K;
const ck::index_t dims_arg_num = 10; const ck::index_t dims_arg_num = 11;
collect_index_params(argv, M, dims_arg_num, 2); collect_index_params(argv, M, dims_arg_num, 2);
collect_index_params(argv, N, dims_arg_num + 2, 2); collect_index_params(argv, N, dims_arg_num + 2, 2);
collect_index_params(argv, K, dims_arg_num + 4, 2); collect_index_params(argv, K, dims_arg_num + 4, 2);
...@@ -76,90 +79,130 @@ int profile_contraction_bilinear(int argc, char* argv[]) ...@@ -76,90 +79,130 @@ int profile_contraction_bilinear(int argc, char* argv[])
collect_index_params(argv, StridesD, dims_arg_num + 18, 4); collect_index_params(argv, StridesD, dims_arg_num + 18, 4);
} }
using F32 = float; using F16 = ck::half_t;
using F64 = double; using BF16 = ck::bhalf_t;
using F32 = float;
auto profile = [&](auto a_layout, auto b_layout, auto cde_layout, auto type) { using F64 = double;
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout); auto profile =
using CDELayout = decltype(cde_layout); [&](auto a_layout, auto b_layout, auto cde_layout, auto type, auto compute_type) {
using ALayout = decltype(a_layout);
using DataType = decltype(type); using BLayout = decltype(b_layout);
using CDELayout = decltype(cde_layout);
if(default_strides)
using DataType = decltype(type);
using ComputeDataType = decltype(compute_type);
if(default_strides)
{
assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
assign_default_strides(b_layout, StridesB, {N[0], N[1], K[0], K[1]});
assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
}
bool pass = ck::profiler::profile_contraction_impl<ALayout,
BLayout,
CDELayout,
DataType,
ComputeDataType,
ck::Tuple<DataType>,
Bilinear>(do_verification,
init_method,
do_log,
time_kernel,
Bilinear{alpha, beta},
M,
N,
K,
StridesA,
StridesB,
StridesE,
StridesD);
return pass;
};
auto run_profile_for_datatype = [&](auto type, auto compute_type) {
if(layout == ContractionMatrixLayout::MK_KN_MN_MN)
{ {
assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]}); return profile(Row{}, Row{}, Row{}, type, compute_type);
assign_default_strides(b_layout, StridesB, {K[0], K[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
} }
bool pass = ck::profiler::profile_contraction_impl<ALayout, else if(layout == ContractionMatrixLayout::MK_NK_MN_MN)
BLayout, {
CDELayout, return profile(Row{}, Col{}, Row{}, type, compute_type);
DataType, }
ck::Tuple<DataType>, else if(layout == ContractionMatrixLayout::KM_KN_MN_MN)
Bilinear>(do_verification, {
init_method, return profile(Col{}, Row{}, Row{}, type, compute_type);
do_log, }
time_kernel, else if(layout == ContractionMatrixLayout::KM_NK_MN_MN)
Bilinear{alpha, beta}, {
M, return profile(Col{}, Col{}, Row{}, type, compute_type);
N, }
K, return false;
StridesA,
StridesB,
StridesE,
StridesD);
return pass;
}; };
if(data_type == ContractionDataType::F32_F32_F32_F32 && if(data_type == ContractionDataType::F32_F32_F32_F32)
layout == ContractionMatrixLayout::MK_KN_MN_MN)
{
return profile(Row{}, Row{}, Row{}, F32{});
}
else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
layout == ContractionMatrixLayout::MK_NK_MN_MN)
{ {
return profile(Row{}, Col{}, Row{}, F32{}); if(compute_data_type == ContractionComputeDataType::F32)
} {
else if(data_type == ContractionDataType::F32_F32_F32_F32 && return run_profile_for_datatype(F32{}, F32{});
layout == ContractionMatrixLayout::KM_KN_MN_MN) }
{ else if(compute_data_type == ContractionComputeDataType::F16)
return profile(Col{}, Row{}, Row{}, F32{}); {
} return run_profile_for_datatype(F32{}, F16{});
else if(data_type == ContractionDataType::F32_F32_F32_F32 && }
layout == ContractionMatrixLayout::KM_NK_MN_MN) else if(compute_data_type == ContractionComputeDataType::BF16)
{ {
return profile(Col{}, Col{}, Row{}, F32{}); return run_profile_for_datatype(F32{}, BF16{});
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else
layout == ContractionMatrixLayout::MK_KN_MN_MN) {
{ std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return profile(Row{}, Row{}, Row{}, F64{}); return 1;
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
layout == ContractionMatrixLayout::MK_NK_MN_MN)
{
return profile(Row{}, Col{}, Row{}, F64{});
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else if(data_type == ContractionDataType::F64_F64_F64_F64)
layout == ContractionMatrixLayout::KM_KN_MN_MN)
{ {
return profile(Col{}, Row{}, Row{}, F64{}); if(compute_data_type == ContractionComputeDataType::F64)
{
return run_profile_for_datatype(F64{}, F64{});
}
else if(compute_data_type == ContractionComputeDataType::F32)
{
return run_profile_for_datatype(F64{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else if(data_type == ContractionDataType::F16_F16_F16_F16)
layout == ContractionMatrixLayout::KM_NK_MN_MN)
{ {
return profile(Col{}, Col{}, Row{}, F64{}); if(compute_data_type == ContractionComputeDataType::F32)
{
return run_profile_for_datatype(F16{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
else else if(data_type == ContractionDataType::BF16_BF16_BF16_BF16)
{ {
std::cout << "this data_type & layout is not implemented" << std::endl; if(compute_data_type == ContractionComputeDataType::F32)
{
return 1; return run_profile_for_datatype(BF16{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
return 1;
} }
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_bilinear); REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_bilinear);
...@@ -17,8 +17,9 @@ ...@@ -17,8 +17,9 @@
static void print_helper_msg() static void print_helper_msg()
{ {
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: fp32; 1: f64)\n" << "arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
<< "arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + " << "arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
<< "arg4: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
<< " 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + " << " 1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
...@@ -26,39 +27,40 @@ static void print_helper_msg() ...@@ -26,39 +27,40 @@ static void print_helper_msg()
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
<< " 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + " << " 3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
"D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n" "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
<< "arg4: verification (0: no; 1: yes)\n" << "arg5: verification (0: no; 1: yes)\n"
<< "arg5: initialization (0: no init; 1: integer value; 2: decimal " << "arg6: initialization (0: no init; 1: integer value; 2: decimal "
<< "value)\n" << "value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n" << "arg7: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n" << "arg8: time kernel (0: no, 1: yes)\n"
<< "arg8: alpha\n" << "arg9: alpha\n"
<< "arg9 to 14: M0, M1, N0, N1, K0, K1\n" << "arg10 to 15: M0, M1, N0, N1, K0, K1\n"
<< "arg15 to 30: Strides for A, B, D and E (skip for default)\n" << "arg16 to 31: Strides for A, B, D and E (skip for default)\n"
<< std::endl; << std::endl;
} }
int profile_contraction_scale(int argc, char* argv[]) int profile_contraction_scale(int argc, char* argv[])
{ {
const bool default_strides = argc == 15; const bool default_strides = argc == 16;
if(argc != 31 && argc != 15) if(argc != 32 && argc != 16)
{ {
print_helper_msg(); print_helper_msg();
exit(1); exit(1);
} }
const auto data_type = static_cast<ContractionDataType>(std::stoi(argv[2])); const auto data_type = static_cast<ContractionDataType>(std::stoi(argv[2]));
const auto layout = static_cast<ContractionMatrixLayout>(std::stoi(argv[3])); const auto compute_data_type = static_cast<ContractionComputeDataType>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]); const auto layout = static_cast<ContractionMatrixLayout>(std::stoi(argv[4]));
const ck::index_t init_method = std::stoi(argv[5]); const bool do_verification = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const ck::index_t init_method = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]); const bool do_log = std::stoi(argv[7]);
const float alpha = std::stof(argv[8]); const bool time_kernel = std::stoi(argv[8]);
const float alpha = std::stof(argv[9]);
std::vector<ck::index_t> M; std::vector<ck::index_t> M;
std::vector<ck::index_t> N; std::vector<ck::index_t> N;
std::vector<ck::index_t> K; std::vector<ck::index_t> K;
const ck::index_t dims_arg_num = 9; const ck::index_t dims_arg_num = 10;
collect_index_params(argv, M, dims_arg_num, 2); collect_index_params(argv, M, dims_arg_num, 2);
collect_index_params(argv, N, dims_arg_num + 2, 2); collect_index_params(argv, N, dims_arg_num + 2, 2);
collect_index_params(argv, K, dims_arg_num + 4, 2); collect_index_params(argv, K, dims_arg_num + 4, 2);
...@@ -75,88 +77,131 @@ int profile_contraction_scale(int argc, char* argv[]) ...@@ -75,88 +77,131 @@ int profile_contraction_scale(int argc, char* argv[])
collect_index_params(argv, StridesD, dims_arg_num + 18, 4); collect_index_params(argv, StridesD, dims_arg_num + 18, 4);
} }
using F32 = float; using F16 = ck::half_t;
using F64 = double; using BF16 = ck::bhalf_t;
using F32 = float;
auto profile = [&](auto a_layout, auto b_layout, auto cde_layout, auto type) { using F64 = double;
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout); auto profile =
using CDELayout = decltype(cde_layout); [&](auto a_layout, auto b_layout, auto cde_layout, auto type, auto compute_type) {
using ALayout = decltype(a_layout);
using DataType = decltype(type); using BLayout = decltype(b_layout);
using CDELayout = decltype(cde_layout);
if(default_strides)
using DataType = decltype(type);
using ComputeDataType = decltype(compute_type);
if(default_strides)
{
assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
assign_default_strides(b_layout, StridesB, {N[0], N[1], K[0], K[1]});
assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
}
bool pass = ck::profiler::profile_contraction_impl<ALayout,
BLayout,
CDELayout,
DataType,
ComputeDataType,
ck::Tuple<>,
Scale>(do_verification,
init_method,
do_log,
time_kernel,
Scale{alpha},
M,
N,
K,
StridesA,
StridesB,
StridesE,
StridesD);
return pass;
};
auto run_profile_for_datatype = [&](auto type, auto compute_type) {
if(layout == ContractionMatrixLayout::MK_KN_MN_MN)
{ {
assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]}); return profile(Row{}, Row{}, Row{}, type, compute_type);
assign_default_strides(b_layout, StridesB, {K[0], K[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
} }
else if(layout == ContractionMatrixLayout::MK_NK_MN_MN)
bool pass = ck::profiler:: {
profile_contraction_impl<ALayout, BLayout, CDELayout, DataType, ck::Tuple<>, Scale>( return profile(Row{}, Col{}, Row{}, type, compute_type);
do_verification, }
init_method, else if(layout == ContractionMatrixLayout::KM_KN_MN_MN)
do_log, {
time_kernel, return profile(Col{}, Row{}, Row{}, type, compute_type);
Scale{alpha}, }
M, else if(layout == ContractionMatrixLayout::KM_NK_MN_MN)
N, {
K, return profile(Col{}, Col{}, Row{}, type, compute_type);
StridesA, }
StridesB, return false;
StridesE,
StridesD);
return pass;
}; };
if(data_type == ContractionDataType::F32_F32_F32_F32 && if(data_type == ContractionDataType::F32_F32_F32_F32)
layout == ContractionMatrixLayout::MK_KN_MN_MN)
{
return profile(Row{}, Row{}, Row{}, F32{});
}
else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
layout == ContractionMatrixLayout::MK_NK_MN_MN)
{
return profile(Row{}, Col{}, Row{}, F32{});
}
else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
layout == ContractionMatrixLayout::KM_KN_MN_MN)
{ {
return profile(Col{}, Row{}, Row{}, F32{}); if(compute_data_type == ContractionComputeDataType::F32)
} {
else if(data_type == ContractionDataType::F32_F32_F32_F32 && return run_profile_for_datatype(F32{}, F32{});
layout == ContractionMatrixLayout::KM_NK_MN_MN) }
{ else if(compute_data_type == ContractionComputeDataType::F16)
return profile(Col{}, Col{}, Row{}, F32{}); {
} return run_profile_for_datatype(F32{}, F16{});
else if(data_type == ContractionDataType::F64_F64_F64_F64 && }
layout == ContractionMatrixLayout::MK_KN_MN_MN) else if(compute_data_type == ContractionComputeDataType::BF16)
{ {
return profile(Row{}, Row{}, Row{}, F64{}); return run_profile_for_datatype(F32{}, BF16{});
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else
layout == ContractionMatrixLayout::MK_NK_MN_MN) {
{ std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return profile(Row{}, Col{}, Row{}, F64{}); return 1;
}
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else if(data_type == ContractionDataType::F64_F64_F64_F64)
layout == ContractionMatrixLayout::KM_KN_MN_MN)
{ {
return profile(Col{}, Row{}, Row{}, F64{}); if(compute_data_type == ContractionComputeDataType::F64)
{
return run_profile_for_datatype(F64{}, F64{});
}
else if(compute_data_type == ContractionComputeDataType::F32)
{
return run_profile_for_datatype(F64{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
else if(data_type == ContractionDataType::F64_F64_F64_F64 && else if(data_type == ContractionDataType::F16_F16_F16_F16)
layout == ContractionMatrixLayout::KM_NK_MN_MN)
{ {
return profile(Col{}, Col{}, Row{}, F64{}); if(compute_data_type == ContractionComputeDataType::F32)
{
return run_profile_for_datatype(F16{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
else else if(data_type == ContractionDataType::BF16_BF16_BF16_BF16)
{ {
std::cout << "this data_type & layout is not implemented" << std::endl; if(compute_data_type == ContractionComputeDataType::F32)
{
return 1; return run_profile_for_datatype(BF16{}, F32{});
}
else
{
std::cout << "Incorrect combination of data type and compute data type." << std::endl;
return 1;
}
} }
return 1;
} }
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_scale); REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_scale);
...@@ -19,7 +19,8 @@ enum struct RearrangeOp ...@@ -19,7 +19,8 @@ enum struct RearrangeOp
enum struct ConvLayout enum struct ConvLayout
{ {
NHWC, // 0 GNHWC, // 0
NHWGC, // 1
}; };
enum struct DataType enum struct DataType
...@@ -42,7 +43,8 @@ static void print_helper_msg() ...@@ -42,7 +43,8 @@ static void print_helper_msg()
<< " 1: Input fp16, Weight fp16, Output fp16\n" << " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight bf16, Output bf16\n" << " 2: Input bf16, Weight bf16, Output bf16\n"
<< " 3: Input int8, Weight int8, Output int8)\n" << " 3: Input int8, Weight int8, Output int8)\n"
<< "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n" << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Output[G * N * Ho * Wo, Y * X * C],\n"
<< " 1: Input[N, Hi, Wi, G, C], Output[N * Ho * Wo * G, Y * X * C])\n"
<< "arg4: verification (0: no, 1: yes)\n" << "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n" << "arg6: print tensor value (0: no; 1: yes)\n"
...@@ -114,11 +116,9 @@ int profile_conv_tensor_rearrange(int argc, char* argv[]) ...@@ -114,11 +116,9 @@ int profile_conv_tensor_rearrange(int argc, char* argv[])
return pass ? 0 : 1; return pass ? 0 : 1;
}; };
// Image To Column
if(rearrange_op == RearrangeOp::ImageToColumn) if(rearrange_op == RearrangeOp::ImageToColumn)
{ {
// NHWC if(layout == ConvLayout::GNHWC)
if(layout == ConvLayout::NHWC)
{ {
if(num_dim_spatial == 1) if(num_dim_spatial == 1)
{ {
...@@ -178,11 +178,70 @@ int profile_conv_tensor_rearrange(int argc, char* argv[]) ...@@ -178,11 +178,70 @@ int profile_conv_tensor_rearrange(int argc, char* argv[])
} }
} }
} }
else if(layout == ConvLayout::NHWGC)
{
if(num_dim_spatial == 1)
{
if(data_type == DataType::F32_F32)
{
return profile(I1, NWGC{}, F32{}, F32{}, ImageToColumn{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I1, NWGC{}, F16{}, F16{}, ImageToColumn{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I1, NWGC{}, BF16{}, BF16{}, ImageToColumn{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I1, NWGC{}, INT8{}, INT8{}, ImageToColumn{});
}
}
else if(num_dim_spatial == 2)
{
if(data_type == DataType::F32_F32)
{
return profile(I2, NHWGC{}, F32{}, F32{}, ImageToColumn{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I2, NHWGC{}, F16{}, F16{}, ImageToColumn{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I2, NHWGC{}, BF16{}, BF16{}, ImageToColumn{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I2, NHWGC{}, INT8{}, INT8{}, ImageToColumn{});
}
}
else if(num_dim_spatial == 3)
{
if(data_type == DataType::F32_F32)
{
return profile(I3, NDHWGC{}, F32{}, F32{}, ImageToColumn{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I3, NDHWGC{}, F16{}, F16{}, ImageToColumn{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I3, NDHWGC{}, BF16{}, BF16{}, ImageToColumn{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I3, NDHWGC{}, INT8{}, INT8{}, ImageToColumn{});
}
}
}
} }
else if(rearrange_op == RearrangeOp::ColumnToImage) else if(rearrange_op == RearrangeOp::ColumnToImage)
{ {
// NHWC if(layout == ConvLayout::GNHWC)
if(layout == ConvLayout::NHWC)
{ {
if(num_dim_spatial == 1) if(num_dim_spatial == 1)
{ {
...@@ -242,6 +301,66 @@ int profile_conv_tensor_rearrange(int argc, char* argv[]) ...@@ -242,6 +301,66 @@ int profile_conv_tensor_rearrange(int argc, char* argv[])
} }
} }
} }
else if(layout == ConvLayout::NHWGC)
{
if(num_dim_spatial == 1)
{
if(data_type == DataType::F32_F32)
{
return profile(I1, NWGC{}, F32{}, F32{}, ColumnToImage{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I1, NWGC{}, F16{}, F16{}, ColumnToImage{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I1, NWGC{}, BF16{}, BF16{}, ColumnToImage{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I1, NWGC{}, INT8{}, INT8{}, ColumnToImage{});
}
}
else if(num_dim_spatial == 2)
{
if(data_type == DataType::F32_F32)
{
return profile(I2, NHWGC{}, F32{}, F32{}, ColumnToImage{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I2, NHWGC{}, F16{}, F16{}, ColumnToImage{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I2, NHWGC{}, BF16{}, BF16{}, ColumnToImage{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I2, NHWGC{}, INT8{}, INT8{}, ColumnToImage{});
}
}
else if(num_dim_spatial == 3)
{
if(data_type == DataType::F32_F32)
{
return profile(I3, NDHWGC{}, F32{}, F32{}, ColumnToImage{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I3, NDHWGC{}, F16{}, F16{}, ColumnToImage{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I3, NDHWGC{}, BF16{}, BF16{}, ColumnToImage{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I3, NDHWGC{}, INT8{}, INT8{}, ColumnToImage{});
}
}
}
} }
std::cout << "this data_type & layout is not implemented" << std::endl; std::cout << "this data_type & layout is not implemented" << std::endl;
......
...@@ -25,6 +25,7 @@ enum struct GemmDataType ...@@ -25,6 +25,7 @@ enum struct GemmDataType
INT8_INT8_INT8, // 3 INT8_INT8_INT8, // 3
F8_F16_F16, // 4 F8_F16_F16, // 4
F16_F8_F16, // 5 F16_F8_F16, // 5
F16_F16_F16_F8, // 6
}; };
#define OP_NAME "gemm_splitk" #define OP_NAME "gemm_splitk"
...@@ -35,7 +36,8 @@ int profile_gemm_splitk(int argc, char* argv[]) ...@@ -35,7 +36,8 @@ int profile_gemm_splitk(int argc, char* argv[])
if(argc != 15) if(argc != 15)
{ {
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8)\n"); printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16, "
"comp f8)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
...@@ -80,7 +82,8 @@ int profile_gemm_splitk(int argc, char* argv[]) ...@@ -80,7 +82,8 @@ int profile_gemm_splitk(int argc, char* argv[])
auto c_type, auto c_type,
auto a_layout, auto a_layout,
auto b_layout, auto b_layout,
auto c_layout) { auto c_layout,
auto compute_type) {
using ADataType = decltype(a_type); using ADataType = decltype(a_type);
using BDataType = decltype(b_type); using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type); using AccDataType = decltype(acc_type);
...@@ -90,6 +93,8 @@ int profile_gemm_splitk(int argc, char* argv[]) ...@@ -90,6 +93,8 @@ int profile_gemm_splitk(int argc, char* argv[])
using BLayout = decltype(b_layout); using BLayout = decltype(b_layout);
using CLayout = decltype(c_layout); using CLayout = decltype(c_layout);
using ComputeType = decltype(compute_type);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M; const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K; const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M; const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
...@@ -100,7 +105,8 @@ int profile_gemm_splitk(int argc, char* argv[]) ...@@ -100,7 +105,8 @@ int profile_gemm_splitk(int argc, char* argv[])
CDataType, CDataType,
ALayout, ALayout,
BLayout, BLayout,
CLayout>( CLayout,
ComputeType>(
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
...@@ -118,68 +124,84 @@ int profile_gemm_splitk(int argc, char* argv[]) ...@@ -118,68 +124,84 @@ int profile_gemm_splitk(int argc, char* argv[])
if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Row{}, Row{}); return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Row{}, Row{}, F32{});
} }
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Col{}, Row{}); return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Col{}, Row{}, F32{});
} }
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Row{}, Row{}); return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Row{}, Row{}, F32{});
} }
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Col{}, Row{}); return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Col{}, Row{}, F32{});
} }
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{}); return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{}); return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{}, F16{});
} }
#if defined CK_ENABLE_FP8 #if defined CK_ENABLE_FP8
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{}); return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{}); return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{}); return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{}); return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Row{}, Row{}); return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Row{}, Row{}, F16{});
} }
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Col{}, Row{}); return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Col{}, Row{}, F16{});
}
else if(data_type == GemmDataType::F16_F16_F16_F8 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}, F8{});
}
else if(data_type == GemmDataType::F16_F16_F16_F8 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}, F8{});
}
else if(data_type == GemmDataType::F16_F16_F16_F8 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{}, F8{});
}
else if(data_type == GemmDataType::F16_F16_F16_F8 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{}, F8{});
} }
#endif #endif
else else
......
...@@ -20,10 +20,11 @@ enum struct ConvLayout ...@@ -20,10 +20,11 @@ enum struct ConvLayout
enum struct ConvDataType enum struct ConvDataType
{ {
F32_F32_F32, // 0 F32_F32_F32, // 0
F16_F16_F16, // 1 F16_F16_F16, // 1
BF16_F32_BF16, // 2 BF16_F32_BF16, // 2
F16_F16_F16_BF8_F8 // 3 F16_F16_F16_BF8_F8, // 3
I8_I8_I8 // 4
}; };
#define OP_NAME "grouped_conv_bwd_weight" #define OP_NAME "grouped_conv_bwd_weight"
...@@ -35,7 +36,8 @@ static void print_helper_msg() ...@@ -35,7 +36,8 @@ static void print_helper_msg()
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n" << " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight fp32, Output bf16\n" << " 2: Input bf16, Weight fp32, Output bf16\n"
<< " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8)\n" << " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n"
<< " 4: Input int8, Weight int8, Output int8)\n"
<< "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, " << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
"N, K, Ho, Wo]\n" "N, K, Ho, Wo]\n"
<< " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, " << " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
...@@ -84,12 +86,8 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) ...@@ -84,12 +86,8 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
using F32 = float; using F32 = float;
using F16 = ck::half_t; using F16 = ck::half_t;
using BF16 = ck::bhalf_t; using BF16 = ck::bhalf_t;
#ifdef CK_ENABLE_FP8 using F8 = ck::f8_t;
using F8 = ck::f8_t; using BF8 = ck::bf8_t;
#endif
#ifdef CK_ENABLE_BF8
using BF8 = ck::bf8_t;
#endif
using namespace ck::tensor_layout::convolution; using namespace ck::tensor_layout::convolution;
...@@ -139,83 +137,93 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) ...@@ -139,83 +137,93 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{ {
return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, F32{}, F32{}); return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{}, F16{}, F16{}); return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
} }
} }
else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, F32{}, F32{}); return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}, F16{}, F16{}); return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
} }
} }
else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
} }
} }
else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, F32{}, F32{}); return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}, F16{}, F16{}); return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
} }
else if(data_type == ConvDataType::I8_I8_I8)
{
return profile(
I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{});
}
} }
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
} }
else if(data_type == ConvDataType::F16_F16_F16_BF8_F8) if(data_type == ConvDataType::F16_F16_F16_BF8_F8)
{ {
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{}); return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{});
} }
else if(data_type == ConvDataType::I8_I8_I8)
{
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{});
}
} }
std::cout << "this data_type & layout is not implemented" << std::endl; std::cout << "this data_type & layout is not implemented" << std::endl;
......
...@@ -27,6 +27,8 @@ enum struct GemmDataType ...@@ -27,6 +27,8 @@ enum struct GemmDataType
F16_F16_F16, // 1 F16_F16_F16, // 1
BF16_BF16_BF16, // 2 BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3 INT8_INT8_INT8, // 3
F8_F16_F16, // 4
F16_F8_F16, // 5
}; };
#define OP_NAME "grouped_gemm" #define OP_NAME "grouped_gemm"
...@@ -56,7 +58,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -56,7 +58,7 @@ int profile_grouped_gemm(int argc, char* argv[])
{ {
std::cout std::cout
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n" << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: fp8@fp6; 5: f16@f8)\n"
<< "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n" << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
<< " 1: A[m, k] * B[n, k] = C[m, n];\n" << " 1: A[m, k] * B[n, k] = C[m, n];\n"
<< " 2: A[k, m] * B[k, n] = C[m, n];\n" << " 2: A[k, m] * B[k, n] = C[m, n];\n"
...@@ -169,6 +171,46 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -169,6 +171,46 @@ int profile_grouped_gemm(int argc, char* argv[])
StrideCs, StrideCs,
kbatch); kbatch);
} }
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::f8_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch);
}
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::f8_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch);
}
else else
{ {
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <unordered_map> #include <unordered_map>
#include "profiler/data_type_enum.hpp" #include "profiler/data_type_enum.hpp"
#include "profiler/profile_groupnorm_impl.hpp" #include "profiler/profile_groupnorm_fwd_impl.hpp"
#include "profiler_operation_registry.hpp" #include "profiler_operation_registry.hpp"
using ck::index_t; using ck::index_t;
...@@ -93,12 +93,12 @@ int profile_groupnorm(int argc, char* argv[]) ...@@ -93,12 +93,12 @@ int profile_groupnorm(int argc, char* argv[])
if(data_type == ck::DataTypeEnum::Float) if(data_type == ck::DataTypeEnum::Float)
{ {
ck::profiler::profile_groupnorm_impl<F32, F32, F32, F32, F32>( ck::profiler::profile_groupnorm_impl<F32, F32, F32, F32, F32, F32, false>(
do_verification, init_method, do_log, time_kernel, length); do_verification, init_method, do_log, time_kernel, length);
} }
else if(data_type == ck::DataTypeEnum::Half) else if(data_type == ck::DataTypeEnum::Half)
{ {
ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16>( ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16, F32, false>(
do_verification, init_method, do_log, time_kernel, length); do_verification, init_method, do_log, time_kernel, length);
} }
else else
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <unordered_map> #include <unordered_map>
#include "profiler/data_type_enum.hpp" #include "profiler/data_type_enum.hpp"
#include "profiler/profile_layernorm_impl.hpp" #include "profiler/profile_layernorm_fwd_impl.hpp"
#include "profiler_operation_registry.hpp" #include "profiler_operation_registry.hpp"
using ck::index_t; using ck::index_t;
...@@ -76,19 +76,46 @@ int profile_layernorm(int argc, char* argv[]) ...@@ -76,19 +76,46 @@ int profile_layernorm(int argc, char* argv[])
arg_parser(argc, argv); arg_parser(argc, argv);
const std::vector<index_t> length = arg_parser.long_opts["length"]; const std::vector<index_t> length = arg_parser.long_opts["length"];
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float; using F32 = float;
constexpr int rank = 2;
if(data_type == ck::DataTypeEnum::Half) if(length.size() == 2)
{ {
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>( constexpr int rank = 2;
do_verification, init_method, do_log, time_kernel, length);
if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F32, false, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, F32, false, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else
{
throw std::runtime_error("not implemented yet");
}
} }
else if(data_type == ck::DataTypeEnum::Float) else if(length.size() == 4)
{ {
ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>( constexpr int rank = 4;
do_verification, init_method, do_log, time_kernel, length);
if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F32, false, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, F32, false, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else
{
throw std::runtime_error("not implemented yet");
}
} }
else else
{ {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_transpose_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct MatrixLayout
{
NCDHW, // 0
NCHWD, // 1
};
enum struct DataType
{
F32_F32_F32_F32_F32, // 0
F16_F16_F16_F16_F16, // 1
};
#define OP_NAME "transpose"
#define OP_DESC "Transpose"
int profile_transpose(int argc, char* argv[])
{
if(argc != 15)
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
// printf("arg3: matrix layout (NCDHW -> NDCHW);\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 13: N, C, D, H, W\n");
exit(1);
}
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
// const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const bool time_kernel = std::stoi(argv[6]);
std::vector<index_t> lengths = std::stoi(argv[7]);
/**const int N = std::stoi(argv[7]);
const int C = std::stoi(argv[8]);
const int D = std::stoi(argv[9]);
const int H = std::stoi(argv[10]);
const int W = std::stoi(argv[11]);**/
using F32 = float;
using F16 = ck::half_t;
auto profile = [&](auto a_type, auto b_type) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
do_verification, init_method, do_log, time_kernel, lengths);
return pass ? 0 : 1;
};
if(data_type == GemmDataType::F32_F32_F32_F32_F32)
{
return profile(F32{}, F32{});
}
else if(data_type == GemmDataType::F16_F16_F16_F16_F16)
{
return profile(F16{}, F16{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_transpose);
...@@ -8,8 +8,7 @@ MY_PROJECT_SOURCE=$1 ...@@ -8,8 +8,7 @@ MY_PROJECT_SOURCE=$1
cmake \ cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker \ -D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
-save-temps=$PWD" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=ON \ -D BUILD_DEV=ON \
-D GPU_TARGETS="gfx908;gfx90a;gfx940" \ -D GPU_TARGETS="gfx908;gfx90a;gfx940" \
......
SECTIONS {
.hipFatBinSegment : { *(.hipFatBinSegment) }
} INSERT AFTER .bss
SECTIONS {
.hip_fatbin : { *(.hip_fatbin) }
} INSERT AFTER .hipFatBinSegment
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment