Commit 289f15de authored by aska-0096's avatar aska-0096
Browse files

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into wmma_gemm

parents 9bd44685 d58b7f51
......@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
......@@ -46,15 +47,15 @@ bool profile_gemm_bilinear_impl(int do_verification,
{
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -116,8 +117,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
// run reference
if(do_verification)
{
Tensor<AccDataType> c_m_n(HostTensorDescriptor(
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
Tensor<AccDataType> c_m_n({M, N});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
......@@ -215,8 +215,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass &&
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
else
......
......@@ -18,6 +18,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
......@@ -45,15 +46,15 @@ int profile_gemm_impl(int do_verification,
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -187,8 +188,7 @@ int profile_gemm_impl(int do_verification,
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass =
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
if(do_log)
{
......
......@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
......@@ -75,15 +76,15 @@ bool profile_gemm_reduce_impl(int do_verification,
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -91,16 +92,12 @@ bool profile_gemm_reduce_impl(int do_verification,
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce1_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce0_m_host_result({M});
Tensor<ReduceDataType> reduce1_m_host_result({M});
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce1_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce0_m_device_result({M});
Tensor<ReduceDataType> reduce1_m_device_result({M});
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
......@@ -313,9 +310,9 @@ bool profile_gemm_reduce_impl(int do_verification,
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
if(do_log)
{
......
......@@ -18,6 +18,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
......@@ -46,15 +47,15 @@ bool profile_gemm_splitk_impl(int do_verification,
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -190,8 +191,7 @@ bool profile_gemm_splitk_impl(int do_verification,
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass =
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
if(do_log)
{
......
......@@ -3,9 +3,10 @@
#pragma once
#include "ck/ck.hpp"
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <typeinfo>
#include "ck/ck.hpp"
......@@ -13,7 +14,7 @@
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -26,32 +27,6 @@
namespace ck {
namespace profiler {
/// Streams a 4-D tensor to std::cout as nested, bracketed lists.
///
/// The extents are read from `nhwc.mDesc.GetLengths()` as [0] = n, [1] = c,
/// [2] = hi, [3] = wi, and every element is fetched as `nhwc(n, c, hi, wi)`.
/// The loops nest n -> hi -> wi -> c, so the values of the index taken from
/// lengths[1] form the innermost list. Each value is converted to float and
/// followed by a single space; no trailing newline is written.
///
/// @param nhwc tensor to print
template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
    // Read the four extents once instead of re-querying the descriptor in every loop test.
    const auto& lengths = nhwc.mDesc.GetLengths();

    const int n_extent  = ck::type_convert<int>(lengths[0]);
    const int c_extent  = ck::type_convert<int>(lengths[1]);
    const int hi_extent = ck::type_convert<int>(lengths[2]);
    const int wi_extent = ck::type_convert<int>(lengths[3]);

    std::cout << "[";

    for(int in = 0; in < n_extent; ++in)
    {
        std::cout << "[";

        for(int ih = 0; ih < hi_extent; ++ih)
        {
            std::cout << "[";

            for(int iw = 0; iw < wi_extent; ++iw)
            {
                std::cout << "[";

                for(int ic = 0; ic < c_extent; ++ic)
                {
                    std::cout << static_cast<float>(nhwc(in, ic, ih, iw)) << " ";
                }

                std::cout << "]";
            }

            std::cout << "]";
        }

        std::cout << "]";
    }

    std::cout << "]";
}
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
......@@ -59,12 +34,12 @@ template <ck::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType>
bool profile_conv_bwd_weight_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
ck::index_t split_k)
bool profile_grouped_conv_bwd_weight_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
ck::index_t split_k)
{
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
......@@ -114,16 +89,14 @@ bool profile_conv_bwd_weight_impl(int do_verification,
if(do_verification)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weight_host_result,
output,
......@@ -138,16 +111,16 @@ bool profile_conv_bwd_weight_impl(int do_verification,
ref_invoker.Run(ref_argument);
}
using DeviceOp = ck::tensor_operation::device::DeviceConvBwdWeight<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
......@@ -163,22 +136,41 @@ bool profile_conv_bwd_weight_impl(int do_verification,
// profile device Conv instances
bool all_pass = true;
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
range_copy(conv_param.input_left_pads_, begin(input_left_pads));
range_copy(conv_param.input_right_pads_, begin(input_right_pads));
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr =
op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.C_,
conv_param.input_spatial_lengths_,
conv_param.filter_spatial_lengths_,
conv_param.output_spatial_lengths_,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op,
......@@ -217,33 +209,29 @@ bool profile_conv_bwd_weight_impl(int do_verification,
{
wei_device_buf.FromDevice(weight_device_result.mData.data());
bool pass =
ck::utils::check_err(weight_host_result.mData, weight_device_result.mData);
bool pass = ck::utils::check_err(weight_device_result, weight_host_result);
if(!pass)
{
std::cout << "Fail info:" << op_ptr->GetTypeString() << std::endl;
std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
}
all_pass &= pass;
if(do_log)
{
std::cout << "in : ";
show_data_nhwc_layout(output);
std::cout << std::endl;
std::cout << "wei: ";
show_data_nhwc_layout(weight_host_result);
std::cout << std::endl;
std::cout << "out : ";
show_data_nhwc_layout(input);
std::cout << std::endl;
std::cout << "wei_device: ";
show_data_nhwc_layout(weight_device_result);
std::cout << std::endl;
LogRangeAsType<float>(std::cout << "output : ", output.mData, ",") << std::endl;
;
LogRangeAsType<float>(
std::cout << "weight (device): ", weight_device_result.mData, ",")
<< std::endl;
;
LogRangeAsType<float>(
std::cout << "weight (host): ", weight_host_result.mData, ",")
<< std::endl;
;
LogRangeAsType<float>(std::cout << "input: ", input.mData, ",") << std::endl;
;
}
}
}
......
......@@ -9,14 +9,11 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
......@@ -69,7 +66,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
......@@ -182,7 +179,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
{
out_device_buf.FromDevice(device_output.mData.data());
pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
pass = pass & ck::utils::check_err(device_output, host_output);
if(do_log)
{
......@@ -201,92 +198,48 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
}
};
// xdl
{
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "xdl found " << op_ptrs.size() << " instances" << std::endl;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
run_impl(op_ptr, argument_ptr);
}
}
// dl
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "xdl found " << op_ptrs.size() << " instances" << std::endl;
for(auto& op_ptr : op_ptrs)
{
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwd<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "dl found " << op_ptrs.size() << " instances" << std::endl;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
run_impl(op_ptr, argument_ptr);
}
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
{},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
{},
{},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
run_impl(op_ptr, argument_ptr);
}
std::cout << "Best configuration parameters:"
......
......@@ -17,6 +17,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
......@@ -45,15 +46,15 @@ bool profile_grouped_gemm_impl(int do_verification,
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -257,8 +258,7 @@ bool profile_grouped_gemm_impl(int do_verification,
c_element_op);
ref_invoker.Run(ref_argument);
pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
c_m_n_host_result.mData);
pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
if(do_log)
{
......
......@@ -165,8 +165,7 @@ bool profile_groupnorm_impl(int do_verification,
{
y_dev.FromDevice(y.mData.data());
bool pass =
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
if(do_log)
{
......
......@@ -6,8 +6,9 @@
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/host_common_util.hpp"
......@@ -359,10 +360,10 @@ bool profile_reduce_impl_impl(bool do_verification,
std::array<index_t, NumOutDim> arrOutLengths;
std::array<index_t, NumOutDim> arrOutStrides;
std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
ck::ranges::copy(inLengths, arrInLengths.begin());
ck::ranges::copy(inStrides, arrInStrides.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin());
for(auto& reduce_ptr : reduce_ptrs)
{
......@@ -411,13 +412,12 @@ bool profile_reduce_impl_impl(bool do_verification,
bool single_pass;
out_dev.FromDevice(out.mData.data());
single_pass = ck::utils::check_err(out.mData, out_ref.mData);
single_pass = ck::utils::check_err(out, out_ref);
if(OutputIndex)
{
out_indices_dev.FromDevice(out_indices.mData.data());
single_pass = single_pass &&
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
};
if(!single_pass)
......@@ -492,7 +492,7 @@ bool profile_reduce_impl(bool do_verification,
std::array<ck::index_t, descType::NumReduceDim_> arrReduceDims;
std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
ck::ranges::copy(reduceDims, arrReduceDims.begin());
pass = pass && profile_reduce_impl_impl<InDataType,
AccDataType,
......
# ckProfiler
#
# Builds the ckProfiler executable from the per-operation profile_*.cpp
# sources and links it against the utility library and the device operation
# instance libraries listed below.
set(PROFILER_EXECUTABLE ckProfiler)

# One translation unit per profiled operation, plus the profiler.cpp driver.
set(PROFILER_SOURCES
    profiler.cpp
    profile_gemm.cpp
    profile_gemm_splitk.cpp
    profile_gemm_bilinear.cpp
    profile_gemm_bias_add_reduce.cpp
    profile_gemm_add_add_fastgelu.cpp
    profile_gemm_reduce.cpp
    profile_batched_gemm.cpp
    profile_batched_gemm_gemm.cpp
    profile_batched_gemm_add_relu_gemm_add.cpp
    profile_batched_gemm_reduce.cpp
    profile_grouped_gemm.cpp
    profile_conv_fwd.cpp
    profile_conv_fwd_bias_relu.cpp
    profile_conv_fwd_bias_relu_add.cpp
    profile_conv_bwd_data.cpp
    profile_grouped_conv_fwd.cpp
    profile_grouped_conv_bwd_weight.cpp
    profile_reduce.cpp
    profile_groupnorm.cpp
    profile_layernorm.cpp
    profile_softmax.cpp
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
)

add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})

# NOTE(review): presumably silences warnings about the global objects used to
# register the profiler operations - confirm.
target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)

# Libraries are listed in the same order as the original one-call-per-library form.
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE
    utility
    device_gemm_instance
    device_gemm_splitk_instance
    device_gemm_bilinear_instance
    device_gemm_add_add_fastgelu_instance
    device_gemm_reduce_instance
    device_gemm_bias_add_reduce_instance
    device_batched_gemm_instance
    device_batched_gemm_gemm_instance
    device_batched_gemm_add_relu_gemm_add_instance
    device_batched_gemm_reduce_instance
    device_grouped_gemm_instance
    device_conv2d_fwd_instance
    device_grouped_conv1d_fwd_instance
    device_grouped_conv2d_fwd_instance
    device_grouped_conv3d_fwd_instance
    device_conv1d_bwd_data_instance
    device_conv2d_bwd_data_instance
    device_conv3d_bwd_data_instance
    device_grouped_conv1d_bwd_weight_instance
    device_grouped_conv2d_bwd_weight_instance
    device_grouped_conv3d_bwd_weight_instance
    device_conv2d_fwd_bias_relu_instance
    device_conv2d_fwd_bias_relu_add_instance
    device_normalization_instance
    device_softmax_instance
    device_reduce_instance
    device_batchnorm_instance
)

rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
......@@ -7,7 +7,8 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_batched_gemm_impl.hpp"
#include "profiler/profile_batched_gemm_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct GemmMatrixLayout
{
......@@ -25,12 +26,15 @@ enum struct GemmDataType
INT8_INT8_INT8, // 3
};
#define OP_NAME "batched_gemm"
#define OP_DESC "Batched GEMM"
int profile_batched_gemm(int argc, char* argv[])
{
if(argc != 18)
{
// clang-format off
printf("arg1: tensor operation (batched_gemm: Batched GEMM)\n");
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n");
printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
printf(" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
......@@ -195,3 +199,5 @@ int profile_batched_gemm(int argc, char* argv[])
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm);
......@@ -6,7 +6,8 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
#include "profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
#include "profiler_operation_registry.hpp"
using F16 = ck::half_t;
using F32 = float;
......@@ -14,6 +15,9 @@ using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
#define OP_NAME "batched_gemm_add_relu_gemm_add"
#define OP_DESC "Batched GEMM+Add+Relu+GEMM+Add"
int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
{
enum struct GemmMatrixLayout
......@@ -109,8 +113,7 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
}
else
{
printf("arg1: tensor operation (batched_gemm_add_relu_gemm_add: "
"Batched_GEMM+Add+Relu+Gemm+Add)\n");
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (1: fp16)\n");
printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
"= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = "
......@@ -207,3 +210,5 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
return 0;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_add_relu_gemm_add);
......@@ -6,7 +6,8 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
#include "profiler/profile_batched_gemm_gemm_impl.hpp"
#include "profiler_operation_registry.hpp"
using F16 = ck::half_t;
using F32 = float;
......@@ -14,6 +15,9 @@ using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
#define OP_NAME "batched_gemm_gemm"
#define OP_DESC "Batched GEMM+GEMM"
int profile_batched_gemm_gemm(int argc, char* argv[])
{
enum struct GemmMatrixLayout
......@@ -101,7 +105,7 @@ int profile_batched_gemm_gemm(int argc, char* argv[])
}
else
{
printf("arg1: tensor operation (batched_gemm_gemm: Batched_GEMM+Gemm)\n");
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (1: fp16)\n");
printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
"= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = E1[m, "
......@@ -179,3 +183,5 @@ int profile_batched_gemm_gemm(int argc, char* argv[])
return 0;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_gemm);
......@@ -6,7 +6,11 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
#include "profiler/profile_batched_gemm_reduce_impl.hpp"
#include "profiler_operation_registry.hpp"
#define OP_NAME "batched_gemm_reduce"
#define OP_DESC "Batched GEMM+Reduce"
int profile_batched_gemm_reduce(int argc, char* argv[])
{
......@@ -26,7 +30,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
if(argc != 15)
{
printf("arg1: tensor operation (batched_gemm_reduce: BatchedGEMM+Reduce)\n");
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
......@@ -151,3 +155,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
return 0;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_reduce);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <getopt.h>
#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_batchnorm_backward_impl.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
using namespace std;
// Long-option table passed to getopt_long() by BatchnormBwdArgParser.
// Each entry is {name, argument requirement, flag pointer, value returned};
// the all-zero entry terminates the table.
static const option long_options[] = {
    {"inOutLengths", required_argument, nullptr, 'D'},
    {"reduceDims", required_argument, nullptr, 'R'},
    {"dumpout", required_argument, nullptr, 'o'},
    {"verify", required_argument, nullptr, 'v'},
    {"help", no_argument, nullptr, '?'},
    {nullptr, 0, nullptr, 0},
};
/// Command-line parser for the batchnorm-backward profiler operation.
///
/// Named options (-D/--inOutLengths, -R/--reduceDims, -v/--verify,
/// -o/--dumpout, --help) are consumed with getopt_long() using the file-level
/// long_options table. Four positional arguments must follow them, in order:
/// data type, use-saved-mean/inv-variance flag, init method, time-kernel flag.
/// The parsed values are exposed as public data members.
class BatchnormBwdArgParser
{
    private:
    // Receives the index of the long option most recently matched by getopt_long().
    int option_index = 0;

    public:
    std::vector<size_t> inLengths;
    std::vector<int> reduceDims;

    bool do_verification = false;
    bool do_dumpout      = false;

    // Initialized explicitly: with the defaulted constructor this member would
    // otherwise hold an indeterminate value until operator() assigns it.
    bool haveSavedMeanInvVar = false;

    int data_type    = 0;
    int init_method  = 2;
    bool time_kernel = false;

    BatchnormBwdArgParser()  = default;
    ~BatchnormBwdArgParser() = default;

    /// Prints the option and positional-argument summary to std::cout.
    /// @param cmd program name shown in the first line of the usage text
    void show_usage(const char* cmd)
    {
        // clang-format off
        std::cout << "Usage of " << cmd << std::endl;
        std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
        std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl;
        std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
        std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bp16, 6: fp64)" << std::endl;
        std::cout << "Arg2 -- 1/0 to indicate whether to use saved mean and invVariance" << std::endl;
        std::cout << "Arg3 -- init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;
        std::cout << "Arg4 -- time kernel (0=no, 1=yes)" << std::endl;
        // clang-format on
    }

    /// Parses argv and fills the public members.
    ///
    /// @param argc argument count as received by the profiler entry point
    /// @param argv argument vector as received by the profiler entry point
    /// @return 0 on success; -1 when --help was requested, when getopt_long()
    ///         returns a value not handled below, or when the data type is not
    ///         one of 0, 1, 3, 5, 6
    /// @throws std::runtime_error when an option has no argument value or when
    ///         fewer than four positional arguments remain
    int operator()(int argc, char* argv[])
    {
        using ck::host_common::getTypeValuesFromString;

        optind++; // to skip the module name

        while(true)
        {
            const int ch = getopt_long(argc, argv, "D:R:v:o:", long_options, &option_index);

            if(ch == -1)
                break;

            switch(ch)
            {
            case 'D':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                inLengths = getTypeValuesFromString<size_t>(optarg);
                break;
            case 'R':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                reduceDims = getTypeValuesFromString<int>(optarg);
                break;
            case 'v':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_verification = static_cast<bool>(std::atoi(optarg));
                break;
            case 'o':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_dumpout = static_cast<bool>(std::atoi(optarg));
                break;
            case '?':
                // '?' is the value assigned to --help in long_options.
                // NOTE(review): getopt_long() is documented to return '?' for
                // unrecognized options as well; those presumably fall through
                // the name check below and are skipped rather than reaching
                // the default branch - confirm whether that is intended.
                if(std::string(long_options[option_index].name) == "help")
                {
                    show_usage(argv[0]);
                    return -1;
                }
                break;
            default:
                show_usage(argv[0]);
                std::cerr << "Invalid cmd-line options!" << std::endl;
                return -1;
            }
        }

        // The four positional arguments are mandatory.
        if(optind + 4 > argc)
            throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");

        data_type           = std::atoi(argv[optind++]);
        haveSavedMeanInvVar = static_cast<bool>(std::atoi(argv[optind++]));
        init_method         = std::atoi(argv[optind++]);
        time_kernel         = static_cast<bool>(std::atoi(argv[optind++]));

        // NOTE(review): 3 is accepted here although show_usage() does not list
        // it and profile_batchnorm_backward() in this file has no branch for
        // it - confirm whether it should be rejected.
        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
            return -1;

        return 0;
    }
}; // end of class BatchnormBwdArgParser
// Epsilon handed to every profile_batchnorm_backward_impl instantiation below
// (machine epsilon of float, stored as double).
static const double epsilon = std::numeric_limits<float>::epsilon();

/// Entry point of the "bnorm_bwd" profiler operation.
///
/// Parses the command line with BatchnormBwdArgParser and forwards the parsed values to
/// the profile_batchnorm_backward_impl instantiation selected by the data type. Only
/// 4 input lengths combined with 3 reduce dimensions are instantiated.
///
/// @param argc argument count, passed on to the parser
/// @param argv argument vector, passed on to the parser
/// @return 0 when a profiling instantiation was run; -1 when parsing failed or the
///         requested shape / data type has no instantiation
int profile_batchnorm_backward(int argc, char* argv[])
{
    using ck::profiler::profile_batchnorm_backward_impl;

    BatchnormBwdArgParser arg_parser;

    if(arg_parser(argc, argv) != 0)
        return -1;

    using F16  = ck::half_t;
    using F32  = float;
    using BF16 = ck::bhalf_t;
    using F64  = double;

    // Report an unsupported shape instead of silently doing nothing and returning success.
    if(arg_parser.inLengths.size() != 4 || arg_parser.reduceDims.size() != 3)
    {
        std::cerr << "Unsupported configuration: expected 4 input lengths and 3 reduce "
                     "dimensions, got "
                  << arg_parser.inLengths.size() << " and " << arg_parser.reduceDims.size()
                  << std::endl;
        return -1;
    }

    // NOTE(review): whatever the impl returns is not inspected here -- confirm whether a
    // failed verification should influence the exit code.
    switch(arg_parser.data_type)
    {
    case 0:
        profile_batchnorm_backward_impl<F16, F32, F32, F32, F16, F32, F32, 4, 3>(
            arg_parser.do_verification,
            arg_parser.init_method,
            arg_parser.do_dumpout,
            arg_parser.time_kernel,
            arg_parser.inLengths,
            arg_parser.reduceDims,
            arg_parser.haveSavedMeanInvVar,
            epsilon);
        break;
    case 1:
        profile_batchnorm_backward_impl<F32, F32, F32, F32, F32, F32, F32, 4, 3>(
            arg_parser.do_verification,
            arg_parser.init_method,
            arg_parser.do_dumpout,
            arg_parser.time_kernel,
            arg_parser.inLengths,
            arg_parser.reduceDims,
            arg_parser.haveSavedMeanInvVar,
            epsilon);
        break;
    case 5:
        profile_batchnorm_backward_impl<BF16, F32, F32, F32, BF16, F32, F32, 4, 3>(
            arg_parser.do_verification,
            arg_parser.init_method,
            arg_parser.do_dumpout,
            arg_parser.time_kernel,
            arg_parser.inLengths,
            arg_parser.reduceDims,
            arg_parser.haveSavedMeanInvVar,
            epsilon);
        break;
    case 6:
        profile_batchnorm_backward_impl<F64, F64, F64, F64, F64, F64, F64, 4, 3>(
            arg_parser.do_verification,
            arg_parser.init_method,
            arg_parser.do_dumpout,
            arg_parser.time_kernel,
            arg_parser.inLengths,
            arg_parser.reduceDims,
            arg_parser.haveSavedMeanInvVar,
            epsilon);
        break;
    default:
        // The parser lets some values through that have no instantiation here.
        std::cerr << "Unsupported data type: " << arg_parser.data_type << std::endl;
        return -1;
    }

    return 0;
}

REGISTER_PROFILER_OPERATION("bnorm_bwd", "Batchnorm backward", profile_batchnorm_backward);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <getopt.h>
#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_batchnorm_forward_impl.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
using namespace std;
// Long-option table handed to getopt_long(). Each value-taking option maps to the same
// character as its short form; "--help" takes no argument and maps to '?'.
static const struct option long_options[] = {
    {"inOutLengths", required_argument, nullptr, 'D'}, // -D
    {"reduceDims", required_argument, nullptr, 'R'},   // -R
    {"dumpout", required_argument, nullptr, 'o'},      // -o
    {"verify", required_argument, nullptr, 'v'},       // -v
    {"help", no_argument, nullptr, '?'},               // long form only
    {nullptr, 0, nullptr, 0},                          // terminator
};
/// Command-line parser for the batchnorm-forward profiler.
///
/// Calling the object parses argv and stores the results in the public members, which
/// the caller reads afterwards.
class BatchnormFwdArgParser
{
    private:
    // Index into long_options; written by getopt_long() only when a long option matched.
    int option_index = 0;

    public:
    std::vector<size_t> inLengths; // set by -D / --inOutLengths
    std::vector<int> reduceDims;   // set by -R / --reduceDims

    bool do_verification = false; // set by -v / --verify
    bool do_dumpout      = false; // set by -o / --dumpout

    // Positional arguments 2 and 3; initialised so they never hold indeterminate values
    // when parsing bails out early.
    bool updateMovingAverage    = false;
    bool saveMeanAndInvVariance = false;

    int data_type    = 0;     // positional argument 1
    int init_method  = 2;     // positional argument 4
    bool time_kernel = false; // positional argument 5

    BatchnormFwdArgParser()  = default;
    ~BatchnormFwdArgParser() = default;

    /// Prints the usage text to std::cout.
    /// @param cmd command name shown in the first line
    void show_usage(const char* cmd)
    {
        // NOTE(review): --dumpout / -o is accepted by operator() but not listed here.
        // clang-format off
        std::cout << "Usage of " << cmd << std::endl;
        std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
        std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl;
        std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
        std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bf16, 6: fp64)" << std::endl;
        std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)" << std::endl;
        std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance (0=no, 1=yes)" << std::endl;
        std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;
        std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl;
        // clang-format on
    }

    /// Parses the command line into the public members.
    ///
    /// The options -D/-R/-v/-o (and the long forms listed in `long_options`) are consumed
    /// first. Exactly five positional arguments must follow.
    ///
    /// @param argc argument count
    /// @param argv argument vector; argv[0] is printed as the command name in the usage text
    /// @return 0 on success; -1 when help was requested, an option was invalid, or the
    ///         data type is not accepted
    /// @throws std::runtime_error when fewer than five positional arguments remain
    int operator()(int argc, char* argv[])
    {
        using ck::host_common::getTypeValuesFromString;

        optind++; // to skip the module name

        while(true)
        {
            // getopt_long() only writes the index when a long option matched. Reset it on
            // every round so that a stale index can never be mistaken for the current
            // option.
            option_index = -1;

            const int ch = getopt_long(argc, argv, "D:R:v:o:", long_options, &option_index);

            if(ch == -1)
                break;

            switch(ch)
            {
            case 'D':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                inLengths = getTypeValuesFromString<size_t>(optarg);
                break;
            case 'R':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                reduceDims = getTypeValuesFromString<int>(optarg);
                break;
            case 'v':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_verification = static_cast<bool>(std::atoi(optarg));
                break;
            case 'o':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_dumpout = static_cast<bool>(std::atoi(optarg));
                break;
            case '?':
            default: {
                // '?' is both the value mapped to --help and getopt's own error code for
                // an unknown option or a missing option argument; only a matched long
                // option named "help" counts as a help request.
                const bool help_requested =
                    ch == '?' && option_index >= 0 &&
                    std::string(long_options[option_index].name) == "help";

                show_usage(argv[0]);

                if(!help_requested)
                    std::cerr << "Invalid cmd-line options!" << std::endl;

                return -1;
            }
            }
        }

        if(optind + 5 > argc)
            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");

        data_type              = std::atoi(argv[optind++]);
        updateMovingAverage    = std::atoi(argv[optind++]) != 0;
        saveMeanAndInvVariance = std::atoi(argv[optind++]) != 0;
        init_method            = std::atoi(argv[optind++]);
        time_kernel            = static_cast<bool>(std::atoi(argv[optind++]));

        // NOTE(review): 3 is accepted here although the usage text only lists 0, 1, 5
        // and 6 -- confirm whether that is intended.
        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
        {
            std::cerr << "Invalid data type: " << data_type << std::endl;
            return -1;
        }

        return 0;
    }
}; // end of class BatchnormFwdArgParser
// Epsilon handed to every profile_batchnorm_forward_impl instantiation below
// (machine epsilon of float, stored as double).
static const double epsilon = std::numeric_limits<float>::epsilon();
// Average factor handed to every profile_batchnorm_forward_impl instantiation below.
static const double averageFactor = 0.1;

/// Entry point of the "bnorm_fwd" profiler operation.
///
/// Parses the command line with BatchnormFwdArgParser and forwards the parsed values to
/// the profile_batchnorm_forward_impl instantiation selected by the data type. Only
/// 4 input lengths combined with 3 reduce dimensions are instantiated.
///
/// @param argc argument count, passed on to the parser
/// @param argv argument vector, passed on to the parser
/// @return 0 when a profiling instantiation was run; -1 when parsing failed or the
///         requested shape / data type has no instantiation
int profile_batchnorm_forward(int argc, char* argv[])
{
    using ck::profiler::profile_batchnorm_forward_impl;

    BatchnormFwdArgParser arg_parser;

    if(arg_parser(argc, argv) != 0)
        return -1;

    using F16  = ck::half_t;
    using F32  = float;
    using BF16 = ck::bhalf_t;
    using F64  = double;

    // Report an unsupported shape instead of silently doing nothing and returning success.
    if(arg_parser.inLengths.size() != 4 || arg_parser.reduceDims.size() != 3)
    {
        std::cerr << "Unsupported configuration: expected 4 input lengths and 3 reduce "
                     "dimensions, got "
                  << arg_parser.inLengths.size() << " and " << arg_parser.reduceDims.size()
                  << std::endl;
        return -1;
    }

    // NOTE(review): whatever the impl returns is not inspected here -- confirm whether a
    // failed verification should influence the exit code.
    switch(arg_parser.data_type)
    {
    case 0:
        profile_batchnorm_forward_impl<F16, F16, F32, F16, F16, F16, 4, 3>(
            arg_parser.do_verification,
            arg_parser.init_method,
            arg_parser.do_dumpout,
            arg_parser.time_kernel,
            arg_parser.inLengths,
            arg_parser.reduceDims,
            arg_parser.updateMovingAverage,
            arg_parser.saveMeanAndInvVariance,
            epsilon,
            averageFactor);
        break;
    case 1:
        profile_batchnorm_forward_impl<F32, F32, F32, F32, F32, F32, 4, 3>(
            arg_parser.do_verification,
            arg_parser.init_method,
            arg_parser.do_dumpout,
            arg_parser.time_kernel,
            arg_parser.inLengths,
            arg_parser.reduceDims,
            arg_parser.updateMovingAverage,
            arg_parser.saveMeanAndInvVariance,
            epsilon,
            averageFactor);
        break;
    case 5:
        profile_batchnorm_forward_impl<BF16, BF16, F32, BF16, BF16, F32, 4, 3>(
            arg_parser.do_verification,
            arg_parser.init_method,
            arg_parser.do_dumpout,
            arg_parser.time_kernel,
            arg_parser.inLengths,
            arg_parser.reduceDims,
            arg_parser.updateMovingAverage,
            arg_parser.saveMeanAndInvVariance,
            epsilon,
            averageFactor);
        break;
    case 6:
        profile_batchnorm_forward_impl<F64, F64, F64, F64, F64, F64, 4, 3>(
            arg_parser.do_verification,
            arg_parser.init_method,
            arg_parser.do_dumpout,
            arg_parser.time_kernel,
            arg_parser.inLengths,
            arg_parser.reduceDims,
            arg_parser.updateMovingAverage,
            arg_parser.saveMeanAndInvVariance,
            epsilon,
            averageFactor);
        break;
    default:
        // The parser lets some values through that have no instantiation here.
        std::cerr << "Unsupported data type: " << arg_parser.data_type << std::endl;
        return -1;
    }

    return 0;
}

REGISTER_PROFILER_OPERATION("bnorm_fwd", "Batchnorm forward", profile_batchnorm_forward);
......@@ -6,7 +6,8 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_conv_bwd_data_impl.hpp"
#include "profiler/profile_conv_bwd_data_impl.hpp"
#include "profiler_operation_registry.hpp"
namespace {
......@@ -24,10 +25,13 @@ enum struct ConvDataType
INT8_INT8_INT8, // 3
};
#define OP_NAME "conv_bwd_data"
#define OP_DESC "Convolution Backward Data"
static void print_helper_msg()
{
std::cout
<< "arg1: tensor operation (conv_bwd_data: Convolution Backward Data)\n"
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight bf16, Output bf16\n"
......@@ -182,3 +186,5 @@ int profile_conv_bwd_data(int argc, char* argv[])
return 1;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_bwd_data);
......@@ -6,7 +6,8 @@
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_conv_fwd_impl.hpp"
#include "profiler/profile_conv_fwd_impl.hpp"
#include "profiler_operation_registry.hpp"
namespace {
......@@ -24,11 +25,14 @@ enum struct ConvDataType
INT8_INT8_INT8, // 3
};
#define OP_NAME "conv_fwd"
#define OP_DESC "Convolution Forward"
static void print_helper_msg()
{
std::cout
// clang-format-off
<< "arg1: tensor operation (conv_fwd: Convolution Forward)\n"
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight bf16, Output bf16\n"
......@@ -184,3 +188,5 @@ int profile_conv_fwd(int argc, char* argv[])
return 1;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd);
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment