Unverified Commit 24af0144 authored by Po Yen Chen, committed by GitHub

Merge branch 'develop' into gemm_layernorm_welford

parents 961f5e9e b79bbbc2
@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 namespace ck {
@@ -75,15 +76,15 @@ bool profile_gemm_reduce_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -91,16 +92,12 @@ bool profile_gemm_reduce_impl(int do_verification,
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_host_result({M});
+    Tensor<ReduceDataType> reduce1_m_host_result({M});

     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_device_result({M});
+    Tensor<ReduceDataType> reduce1_m_device_result({M});

     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -313,9 +310,9 @@ bool profile_gemm_reduce_impl(int do_verification,
     reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
     reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());

-    ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
-    ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
-    ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
+    ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+    ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
+    ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);

     if(do_log)
     {
...
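Aside on the new HostTensorDescriptor calls above: a braced list such as {stride, 1} mixes std::size_t with int, which cannot deduce a single element type when the receiving constructor deduces its argument; spelling the constant as 1_uz keeps the list homogeneous. A minimal, hedged sketch of the literal, assuming ck/library/utility/literals.hpp defines an equivalent operator""_uz:

// Sketch only: mirrors the assumed contents of ck/library/utility/literals.hpp.
#include <cstddef>
#include <initializer_list>

namespace ck {
namespace literals {
// user-defined literal producing a std::size_t constant
constexpr std::size_t operator""_uz(unsigned long long x) { return static_cast<std::size_t>(x); }
} // namespace literals
} // namespace ck

int main()
{
    using namespace ck::literals;
    std::size_t stride = 128;

    // auto bad = {stride, 1};     // ill-formed: size_t and int deduce no common type
    auto lens = {stride, 1_uz};    // deduces std::initializer_list<std::size_t>

    return lens.size() == 2 ? 0 : 1;
}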
@@ -18,6 +18,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 namespace ck {
@@ -46,15 +47,15 @@ bool profile_gemm_splitk_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -190,8 +191,7 @@ bool profile_gemm_splitk_impl(int do_verification,
     {
         c_device_buf.FromDevice(c_m_n_device_result.mData.data());

-        pass =
-            pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);

         if(do_log)
         {
...
@@ -3,9 +3,10 @@
 #pragma once

-#include "ck/ck.hpp"
+#include <algorithm>
 #include <iomanip>
 #include <iostream>
+#include <iterator>
 #include <typeinfo>

 #include "ck/ck.hpp"
@@ -13,7 +14,7 @@
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -26,32 +27,6 @@
 namespace ck {
 namespace profiler {

-template <typename DataType>
-void show_data_nhwc_layout(Tensor<DataType>& nhwc)
-{
-    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
-    {
-        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
-        {
-            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
-            {
-                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
-                {
-                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
-                }
-                std::cout << "]";
-            }
-            std::cout << "]";
-        }
-        std::cout << "]";
-    }
-    std::cout << "]";
-}
-
 template <ck::index_t NDimSpatial,
           typename InLayout,
           typename WeiLayout,
@@ -59,12 +34,12 @@ template <ck::index_t NDimSpatial,
           typename InDataType,
           typename WeiDataType,
           typename OutDataType>
-bool profile_conv_bwd_weight_impl(int do_verification,
-                                  int init_method,
-                                  bool do_log,
-                                  bool time_kernel,
-                                  const ck::utils::conv::ConvParam& conv_param,
-                                  ck::index_t split_k)
+bool profile_grouped_conv_bwd_weight_impl(int do_verification,
+                                          int init_method,
+                                          bool do_log,
+                                          bool time_kernel,
+                                          const ck::utils::conv::ConvParam& conv_param,
+                                          ck::index_t split_k)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -114,16 +89,14 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
     if(do_verification)
     {
         auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
                                                                            InDataType,
                                                                            WeiDataType,
                                                                            OutDataType,
                                                                            InElementOp,
                                                                            WeiElementOp,
                                                                            OutElementOp>{};
-
         auto ref_invoker = ref_conv.MakeInvoker();
-
         auto ref_argument = ref_conv.MakeArgument(input,
                                                   weight_host_result,
                                                   output,
@@ -138,16 +111,16 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
         ref_invoker.Run(ref_argument);
     }

-    using DeviceOp = ck::tensor_operation::device::DeviceConvBwdWeight<NDimSpatial,
-                                                                       InLayout,
-                                                                       WeiLayout,
-                                                                       OutLayout,
-                                                                       InDataType,
-                                                                       WeiDataType,
-                                                                       OutDataType,
-                                                                       InElementOp,
-                                                                       WeiElementOp,
-                                                                       OutElementOp>;
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NDimSpatial,
+                                                                              InLayout,
+                                                                              WeiLayout,
+                                                                              OutLayout,
+                                                                              InDataType,
+                                                                              WeiDataType,
+                                                                              OutDataType,
+                                                                              InElementOp,
+                                                                              WeiElementOp,
+                                                                              OutElementOp>;

     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -163,22 +136,41 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
     // profile device Conv instances
     bool all_pass = true;

+    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
+
+    range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
+    range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
+    range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
+    range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
+    range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
+    range_copy(conv_param.input_left_pads_, begin(input_left_pads));
+    range_copy(conv_param.input_right_pads_, begin(input_right_pads));
+
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr =
             op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                         static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                         static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                        conv_param.G_,
                                         conv_param.N_,
                                         conv_param.K_,
                                         conv_param.C_,
-                                        conv_param.input_spatial_lengths_,
-                                        conv_param.filter_spatial_lengths_,
-                                        conv_param.output_spatial_lengths_,
-                                        conv_param.conv_filter_strides_,
-                                        conv_param.conv_filter_dilations_,
-                                        conv_param.input_left_pads_,
-                                        conv_param.input_right_pads_,
+                                        input_spatial_lengths,
+                                        filter_spatial_lengths,
+                                        output_spatial_lengths,
+                                        conv_filter_strides,
+                                        conv_filter_dilations,
+                                        input_left_pads,
+                                        input_right_pads,
                                         in_element_op,
                                         wei_element_op,
                                         out_element_op,
@@ -217,33 +209,29 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
         {
             wei_device_buf.FromDevice(weight_device_result.mData.data());

-            bool pass =
-                ck::utils::check_err(weight_host_result.mData, weight_device_result.mData);
+            bool pass = ck::utils::check_err(weight_device_result, weight_host_result);

             if(!pass)
             {
-                std::cout << "Fail info:" << op_ptr->GetTypeString() << std::endl;
+                std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
             }

             all_pass &= pass;

             if(do_log)
             {
-                std::cout << "in : ";
-                show_data_nhwc_layout(output);
-                std::cout << std::endl;
-
-                std::cout << "wei: ";
-                show_data_nhwc_layout(weight_host_result);
-                std::cout << std::endl;
-
-                std::cout << "out : ";
-                show_data_nhwc_layout(input);
-                std::cout << std::endl;
-
-                std::cout << "wei_device: ";
-                show_data_nhwc_layout(weight_device_result);
-                std::cout << std::endl;
+                LogRangeAsType<float>(std::cout << "output : ", output.mData, ",") << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "weight (device): ", weight_device_result.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(
+                    std::cout << "weight (host): ", weight_host_result.mData, ",")
+                    << std::endl;
+                LogRangeAsType<float>(std::cout << "input: ", input.mData, ",") << std::endl;
             }
         }
     }
...
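The block of std::array locals plus range_copy above is the mechanical part of moving to the grouped API, whose MakeArgumentPointer takes fixed-size arrays rather than vectors. A hedged, self-contained sketch of the idiom (the vector is a stand-in for a ConvParam member; only the lambda matches the diff verbatim):

#include <algorithm>
#include <array>
#include <cassert>
#include <iterator>
#include <vector>

int main()
{
    constexpr int NDimSpatial = 2;
    const std::vector<int> conv_filter_strides_{2, 2}; // stand-in for conv_param.conv_filter_strides_

    std::array<int, NDimSpatial> conv_filter_strides{};

    // copy any range onto an output iterator; begin/end are found via ADL
    auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
    range_copy(conv_filter_strides_, begin(conv_filter_strides));

    assert(conv_filter_strides[0] == 2 && conv_filter_strides[1] == 2);
    return 0;
}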
@@ -9,11 +9,12 @@
 #include "ck/ck.hpp"

 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -66,7 +67,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     std::array<ck::index_t, NDimSpatial> input_left_pads{};
     std::array<ck::index_t, NDimSpatial> input_right_pads{};

-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };

     copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
     copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
@@ -136,25 +137,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
         ref_invoker.Run(ref_argument);
     }

-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 ck::Tuple<>,
-                                                                                 OutLayout,
-                                                                                 InDataType,
-                                                                                 WeiDataType,
-                                                                                 ck::Tuple<>,
-                                                                                 OutDataType,
-                                                                                 InElementOp,
-                                                                                 WeiElementOp,
-                                                                                 OutElementOp>;
-
-    // get device op instances
-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp>::GetInstances();
-
-    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
-
     std::string best_op_name;
     float best_avg_time = 0;
     float best_tflops   = 0;
@@ -163,29 +145,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     // profile device op instances
     bool pass = true;

-    for(auto& op_ptr : op_ptrs)
-    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
-                                        wei_device_buf.GetDeviceBuffer(),
-                                        std::array<const void*, 0>{},
-                                        out_device_buf.GetDeviceBuffer(),
-                                        a_g_n_c_wis_lengths,
-                                        a_g_n_c_wis_strides,
-                                        b_g_k_c_xs_lengths,
-                                        b_g_k_c_xs_strides,
-                                        std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                        std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                        e_g_n_k_wos_lengths,
-                                        e_g_n_k_wos_strides,
-                                        conv_filter_strides,
-                                        conv_filter_dilations,
-                                        input_left_pads,
-                                        input_right_pads,
-                                        in_element_op,
-                                        wei_element_op,
-                                        out_element_op);
-
+    auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             // re-init output to zero before profiling next kernel
@@ -220,7 +180,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
             {
                 out_device_buf.FromDevice(device_output.mData.data());

-                pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+                pass = pass & ck::utils::check_err(device_output, host_output);

                 if(do_log)
                 {
@@ -237,6 +197,95 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
         {
             std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
         }
+    };
+
+    // xdl
+    {
+        using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
+                                                                                     InLayout,
+                                                                                     WeiLayout,
+                                                                                     ck::Tuple<>,
+                                                                                     OutLayout,
+                                                                                     InDataType,
+                                                                                     WeiDataType,
+                                                                                     ck::Tuple<>,
+                                                                                     OutDataType,
+                                                                                     InElementOp,
+                                                                                     WeiElementOp,
+                                                                                     OutElementOp>;
+
+        // get device op instances
+        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+        std::cout << "xdl found " << op_ptrs.size() << " instances" << std::endl;
+
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                            wei_device_buf.GetDeviceBuffer(),
+                                                            {},
+                                                            out_device_buf.GetDeviceBuffer(),
+                                                            a_g_n_c_wis_lengths,
+                                                            a_g_n_c_wis_strides,
+                                                            b_g_k_c_xs_lengths,
+                                                            b_g_k_c_xs_strides,
+                                                            {},
+                                                            {},
+                                                            e_g_n_k_wos_lengths,
+                                                            e_g_n_k_wos_strides,
+                                                            conv_filter_strides,
+                                                            conv_filter_dilations,
+                                                            input_left_pads,
+                                                            input_right_pads,
+                                                            in_element_op,
+                                                            wei_element_op,
+                                                            out_element_op);
+
+            run_impl(op_ptr, argument_ptr);
+        }
+    }
+
+    // dl
+    {
+        using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwd<NDimSpatial,
+                                                                            InLayout,
+                                                                            WeiLayout,
+                                                                            OutLayout,
+                                                                            InDataType,
+                                                                            WeiDataType,
+                                                                            OutDataType,
+                                                                            InElementOp,
+                                                                            WeiElementOp,
+                                                                            OutElementOp>;
+
+        // get device op instances
+        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+        std::cout << "dl found " << op_ptrs.size() << " instances" << std::endl;
+
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                            wei_device_buf.GetDeviceBuffer(),
+                                                            out_device_buf.GetDeviceBuffer(),
+                                                            a_g_n_c_wis_lengths,
+                                                            a_g_n_c_wis_strides,
+                                                            b_g_k_c_xs_lengths,
+                                                            b_g_k_c_xs_strides,
+                                                            e_g_n_k_wos_lengths,
+                                                            e_g_n_k_wos_strides,
+                                                            conv_filter_strides,
+                                                            conv_filter_dilations,
+                                                            input_left_pads,
+                                                            input_right_pads,
+                                                            in_element_op,
+                                                            wei_element_op,
+                                                            out_element_op);
+
+            run_impl(op_ptr, argument_ptr);
+        }
+    }

     std::cout << "Best configuration parameters:"
...
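A note on the refactor above: hoisting the loop body into the run_impl generic lambda lets the same profiling/verification code run over two unrelated device-op families (xdl and dl) whose argument types differ. A hedged sketch of the pattern with hypothetical stand-in types, not the CK op interfaces:

#include <iostream>
#include <memory>

struct XdlArg { };
struct DlArg  { };
struct XdlOp { bool IsSupported(const XdlArg&) const { return true; } };
struct DlOp  { bool IsSupported(const DlArg&) const { return false; } };

int main()
{
    // one body, instantiated per op/argument type pair at each call site
    auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
        if(op_ptr->IsSupported(*argument_ptr))
            std::cout << "profile this instance\n";
        else
            std::cout << "does not support this problem\n";
    };

    auto xdl_op  = std::make_unique<XdlOp>();
    auto xdl_arg = std::make_unique<XdlArg>();
    run_impl(xdl_op, xdl_arg);

    auto dl_op  = std::make_unique<DlOp>();
    auto dl_arg = std::make_unique<DlArg>();
    run_impl(dl_op, dl_arg);
    return 0;
}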
@@ -17,6 +17,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 namespace ck {
@@ -45,15 +46,15 @@ bool profile_grouped_gemm_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -257,8 +258,7 @@ bool profile_grouped_gemm_impl(int do_verification,
                                 c_element_op);

             ref_invoker.Run(ref_argument);

-            pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
-                                                c_m_n_host_result.mData);
+            pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);

             if(do_log)
             {
...
@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"

-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -75,14 +75,14 @@ bool profile_groupnorm_impl(int do_verification,
     beta_dev.ToDevice(beta.mData.data());

     // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
-                                                                   GammaDataType,
-                                                                   BetaDataType,
-                                                                   AccDataType,
-                                                                   YDataType,
-                                                                   PassThrough,
-                                                                   5,
-                                                                   3>;
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                                       GammaDataType,
+                                                                       BetaDataType,
+                                                                       AccDataType,
+                                                                       YDataType,
+                                                                       PassThrough,
+                                                                       5,
+                                                                       3>;

     // get device op instances
     const auto instance_ptrs =
@@ -126,6 +126,8 @@ bool profile_groupnorm_impl(int do_verification,
                                                           gamma_dev.GetDeviceBuffer(),
                                                           beta_dev.GetDeviceBuffer(),
                                                           y_dev.GetDeviceBuffer(),
+                                                          nullptr,
+                                                          nullptr,
                                                           PassThrough{});

         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
@@ -163,8 +165,7 @@ bool profile_groupnorm_impl(int do_verification,
         {
             y_dev.FromDevice(y.mData.data());

-            bool pass =
-                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
+            bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);

             if(do_log)
             {
@@ -196,7 +197,7 @@ bool profile_groupnorm_impl(int do_verification,
     if(num_kernel == 0)
     {
-        std::cout << "Error: No kernel is tested" << std::endl;
+        std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
...
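The check_err call above now passes the tensors themselves plus a message and tolerances. Its exact comparison rule lives in ck/library/utility/check_err.hpp; the sketch below only illustrates the usual mixed relative/absolute tolerance test that such a (message, rtol, atol) signature suggests — an assumption, not CK's verbatim logic:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// hypothetical stand-in for ck::utils::check_err(result, reference, msg, rtol, atol)
bool almost_equal(const std::vector<float>& out,
                  const std::vector<float>& ref,
                  double rtol = 1e-3,
                  double atol = 1e-3)
{
    for(std::size_t i = 0; i < out.size(); ++i)
    {
        const double err = std::abs(static_cast<double>(out[i]) - ref[i]);
        if(err > atol + rtol * std::abs(static_cast<double>(ref[i])))
            return false; // outside tolerance
    }
    return true;
}

int main()
{
    std::vector<float> y{1.0005f, 2.f};
    std::vector<float> host_y{1.f, 2.f};
    assert(almost_equal(y, host_y));
    return 0;
}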
@@ -6,9 +6,7 @@
 #include <iomanip>

 #include "ck/ck.hpp"

-#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -24,35 +22,36 @@ template <typename XDataType,
           typename AccDataType,
           typename YDataType,
           index_t Rank>
-void profile_layernorm_impl(int do_verification,
-                            int init_method,
-                            bool do_log,
-                            bool time_kernel,
-                            std::vector<index_t> length,
-                            std::vector<index_t> strideXY,
-                            std::vector<index_t> strideGamma,
-                            std::vector<index_t> strideBeta)
+bool profile_layernorm_impl(int do_verification,
+                            int init_method,
+                            bool do_log,
+                            bool time_kernel,
+                            std::vector<index_t> length)
 {
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;

     if(length.size() < 2)
-        return;
+        return false;

-    // Assume normalize dimension except for first dimension
+    // Assume normalize dimension except for batch (first) dimension
     std::vector<index_t> reduce_length{length.begin() + 1, length.end()};
     std::vector<index_t> reduce_dim;

     for(int i = 1; i < Rank; ++i)
         reduce_dim.push_back(i);

     Tensor<XDataType> x(length);
-    Tensor<GammaDataType> gamma(reduce_length, strideGamma);
-    Tensor<BetaDataType> beta(reduce_length, strideBeta);
-    Tensor<YDataType> y(length, strideXY);
-    Tensor<YDataType> host_y(length, strideXY);
+    Tensor<GammaDataType> gamma(reduce_length);
+    Tensor<BetaDataType> beta(reduce_length);
+    Tensor<YDataType> y(length);
+    Tensor<YDataType> host_y(length);
+
+    std::vector<index_t> strideXY =
+        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
+    std::vector<index_t> strideGammaBeta = strideXY;
+    strideGammaBeta[0] = 0;

     switch(init_method)
     {
-    // case 0: break;
     case 0:
         x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
         gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
@@ -84,14 +83,14 @@ bool profile_layernorm_impl(int do_verification,
     constexpr int NumReduceDim = Rank - 1;

     // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
-                                                                   GammaDataType,
-                                                                   BetaDataType,
-                                                                   AccDataType,
-                                                                   YDataType,
-                                                                   PassThrough,
-                                                                   Rank,
-                                                                   NumReduceDim>;
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                                       GammaDataType,
+                                                                       BetaDataType,
+                                                                       AccDataType,
+                                                                       YDataType,
+                                                                       PassThrough,
+                                                                       Rank,
+                                                                       NumReduceDim>;

     // get device op instances
     const auto instance_ptrs =
@@ -122,12 +121,14 @@ bool profile_layernorm_impl(int do_verification,
         ref_invoker.Run(ref_argument);
     }

+    int num_kernel = 0;
+
     for(auto& inst_ptr : instance_ptrs)
     {
         auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
                                                           strideXY,
-                                                          strideGamma,
-                                                          strideBeta,
+                                                          strideGammaBeta,
+                                                          strideGammaBeta,
                                                           strideXY,
                                                           reduce_dim,
                                                           1e-4,
@@ -135,12 +136,21 @@ bool profile_layernorm_impl(int do_verification,
                                                           gamma_dev.GetDeviceBuffer(),
                                                           beta_dev.GetDeviceBuffer(),
                                                           y_dev.GetDeviceBuffer(),
+                                                          nullptr,
+                                                          nullptr,
                                                           PassThrough{});

-        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
-            LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
+            ++num_kernel;
+        }
+        else
+        {
+            if(time_kernel)
+            {
+                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+                LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
+            }

             continue;
         }
@@ -156,8 +166,9 @@ bool profile_layernorm_impl(int do_verification,
         float gb_per_sec = num_bytes / 1.E6 / avg_time;

-        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
-                  << inst_ptr->GetTypeString() << std::endl;
+        if(time_kernel)
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec
+                      << " GB/s, " << inst_ptr->GetTypeString() << std::endl;

         if(avg_time < best_avg_time)
         {
@@ -184,20 +195,32 @@ bool profile_layernorm_impl(int do_verification,
             {
                 std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                 LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
-                return;
+                return false;
             }
             else
             {
-                std::cout << "pass" << std::endl;
+                if(time_kernel)
+                    std::cout << "pass" << std::endl;
             }
         }
     }

-    LogRange(std::cout << "length = ", length, ",") << ", ";
-    LogRange(std::cout << "stride = ", strideXY, ",") << ", ";
-    LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
-    std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
-              << best_instance_name << std::endl;
+    if(time_kernel)
+    {
+        LogRange(std::cout << "length = ", length, ",") << ", ";
+        LogRange(std::cout << "stride = ", strideXY, ",") << ", ";
+        LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
+
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
+
+    return true;
 }

 } // namespace profiler
...
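The new strideGammaBeta above copies the x strides and zeroes the batch stride: gamma and beta are stored once per normalized slice, and a stride of 0 on the leading dimension makes every batch row address the same elements. A small sketch of that addressing rule (the indexing helper is illustrative, not the CK descriptor math):

#include <cassert>
#include <vector>

int main()
{
    const int batch = 4;
    const int d     = 3;
    std::vector<float> gamma{1.f, 2.f, 3.f}; // one value per normalized element

    const int strideGammaBeta[2] = {0, 1}; // batch stride 0 => broadcast across batch

    auto at = [&](int n, int i) {
        return gamma[n * strideGammaBeta[0] + i * strideGammaBeta[1]];
    };

    for(int n = 0; n < batch; ++n)
        for(int i = 0; i < d; ++i)
            assert(at(n, i) == gamma[i]); // every batch index reads the same slice

    return 0;
}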
@@ -18,57 +18,61 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
+template <index_t Rank,
+          index_t NumReduceDim,
+          ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool UseIndex>
 struct ReduceDescription
 {
-    static constexpr int Rank_         = Rank;
-    static constexpr int NumReduceDim_ = NumReduceDim;
-    static constexpr int ReduceOpId_   = ReduceOpId;
-    static constexpr int PropagateNan_ = PropagateNan;
-    static constexpr int UseIndex_     = UseIndex;
+    static constexpr index_t Rank_              = Rank;
+    static constexpr index_t NumReduceDim_      = NumReduceDim;
+    static constexpr ReduceTensorOp ReduceOpId_ = ReduceOpId;
+    static constexpr bool PropagateNan_         = PropagateNan;
+    static constexpr bool UseIndex_             = UseIndex;
 };

 using reduce_description_instances =
-    std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD
-               ReduceDescription<4, 4, 0, false, false>,
-               ReduceDescription<4, 1, 0, false, false>,
-               ReduceDescription<2, 1, 0, false, false>,
-               ReduceDescription<4, 3, 5, false, false>, // for AVG
-               ReduceDescription<4, 4, 5, false, false>,
-               ReduceDescription<4, 1, 5, false, false>,
-               ReduceDescription<2, 1, 5, false, false>,
-               ReduceDescription<4, 3, 7, false, false>, // for NORM2
-               ReduceDescription<4, 4, 7, false, false>,
-               ReduceDescription<4, 1, 7, false, false>,
-               ReduceDescription<2, 1, 7, false, false>,
-               ReduceDescription<4, 3, 2, false, false>, // for MIN
-               ReduceDescription<4, 4, 2, false, false>,
-               ReduceDescription<4, 1, 2, false, false>,
-               ReduceDescription<2, 1, 2, false, false>,
-               ReduceDescription<4, 3, 3, false, false>, // for MAX
-               ReduceDescription<4, 4, 3, false, false>,
-               ReduceDescription<4, 1, 3, false, false>,
-               ReduceDescription<2, 1, 3, false, false>,
-               ReduceDescription<4, 3, 4, false, false>, // for AMAX
-               ReduceDescription<4, 4, 4, false, false>,
-               ReduceDescription<4, 1, 4, false, false>,
-               ReduceDescription<2, 1, 4, false, false>,
-               ReduceDescription<4, 3, 2, false, true>, // for MIN
-               ReduceDescription<4, 4, 2, false, true>,
-               ReduceDescription<4, 1, 2, false, true>,
-               ReduceDescription<2, 1, 2, false, true>,
-               ReduceDescription<4, 3, 3, false, true>, // for MAX
-               ReduceDescription<4, 4, 3, false, true>,
-               ReduceDescription<4, 1, 3, false, true>,
-               ReduceDescription<2, 1, 3, false, true>,
-               ReduceDescription<4, 3, 4, false, true>, // for AMAX
-               ReduceDescription<4, 4, 4, false, true>,
-               ReduceDescription<4, 1, 4, false, true>,
-               ReduceDescription<2, 1, 4, false, true>>;
+    std::tuple<ReduceDescription<4, 3, ReduceTensorOp::ADD, false, false>, // for ADD
+               ReduceDescription<4, 4, ReduceTensorOp::ADD, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::ADD, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::ADD, false, false>,
+               ReduceDescription<4, 3, ReduceTensorOp::AVG, false, false>, // for AVG
+               ReduceDescription<4, 4, ReduceTensorOp::AVG, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::AVG, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::AVG, false, false>,
+               ReduceDescription<4, 3, ReduceTensorOp::NORM2, false, false>, // for NORM2
+               ReduceDescription<4, 4, ReduceTensorOp::NORM2, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::NORM2, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::NORM2, false, false>,
+               ReduceDescription<4, 3, ReduceTensorOp::MIN, false, false>, // for MIN
+               ReduceDescription<4, 4, ReduceTensorOp::MIN, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::MIN, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::MIN, false, false>,
+               ReduceDescription<4, 3, ReduceTensorOp::MAX, false, false>, // for MAX
+               ReduceDescription<4, 4, ReduceTensorOp::MAX, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::MAX, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::MAX, false, false>,
+               ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, false>, // for AMAX
+               ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, false>,
+               ReduceDescription<4, 3, ReduceTensorOp::MIN, false, true>, // for MIN
+               ReduceDescription<4, 4, ReduceTensorOp::MIN, false, true>,
+               ReduceDescription<4, 1, ReduceTensorOp::MIN, false, true>,
+               ReduceDescription<2, 1, ReduceTensorOp::MIN, false, true>,
+               ReduceDescription<4, 3, ReduceTensorOp::MAX, false, true>, // for MAX
+               ReduceDescription<4, 4, ReduceTensorOp::MAX, false, true>,
+               ReduceDescription<4, 1, ReduceTensorOp::MAX, false, true>,
+               ReduceDescription<2, 1, ReduceTensorOp::MAX, false, true>,
+               ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, true>, // for AMAX
+               ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, true>,
+               ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, true>,
+               ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, true>>;
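Replacing the int template parameter with the ReduceTensorOp enum removes the magic numbers and lets the compiler reject mismatched arguments. A hedged sketch of the effect; the enumerator values below are inferred from the replacements in this diff (0=ADD, 2=MIN, 3=MAX, 4=AMAX, 5=AVG, 7=NORM2) and are otherwise an assumption:

#include <cstdio>

enum class ReduceTensorOp { ADD, MUL, MIN, MAX, AMAX, AVG, NORM1, NORM2 };

template <int Rank, ReduceTensorOp Op, bool PropagateNan>
struct ReduceDescription
{
    static constexpr ReduceTensorOp ReduceOpId_ = Op;
};

int main()
{
    using Desc = ReduceDescription<4, ReduceTensorOp::AVG, false>;
    // using Bad = ReduceDescription<4, 5, false>; // error: 5 is not a ReduceTensorOp

    if(Desc::ReduceOpId_ == ReduceTensorOp::AVG) // no static_cast needed, unlike the old int field
        std::puts("matched AVG");
    return 0;
}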
 template <typename DescriptionType>
 bool description_match(const DescriptionType& description,
@@ -78,9 +82,8 @@ bool description_match(const DescriptionType& description,
                        bool PropagateNan,
                        bool UseIndex)
 {
-    if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast<int>(ReduceOpId) ||
-       description.PropagateNan_ != static_cast<int>(PropagateNan) ||
-       description.UseIndex_ != static_cast<int>(UseIndex))
+    if(description.Rank_ != Rank || description.ReduceOpId_ != ReduceOpId ||
+       description.PropagateNan_ != PropagateNan || description.UseIndex_ != UseIndex)
         return (false);

     if(DescriptionType::NumReduceDim_ != reduceDims.size())
@@ -99,11 +102,10 @@
 namespace ck {
 namespace profiler {

-template <index_t Rank, index_t NumReduceDim>
-static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
+template <int Rank, int NumReduceDim>
+static inline std::array<int, Rank - NumReduceDim>
+get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
 {
-    assert(NumReduceDim == reduceDims.size());
-
     int reduceFlag = 0;

     // flag the bits for the reduceDims
@@ -112,13 +114,15 @@ static inline std::array<int, Rank - NumReduceDim>
         reduceFlag |= 1 << reduceDims[i];
     };

-    std::vector<int> invariantDims;
+    std::array<int, Rank - NumReduceDim> invariantDims;

     // collect invariant dimensions
+    int dim = 0;
+
     for(int i = 0; i < Rank; i++)
         if((reduceFlag & (1 << i)) == 0)
         {
-            invariantDims.push_back(i);
+            invariantDims[dim] = i;
+            dim++;
         };

     return invariantDims;
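get_invariant_dims above now returns a std::array sized at compile time; the bitmask walk is unchanged. A standalone copy of that logic, runnable as-is, showing the Rank - NumReduceDim sizing:

#include <array>
#include <cassert>

template <int Rank, int NumReduceDim>
std::array<int, Rank - NumReduceDim>
invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{
    int reduceFlag = 0;

    // flag the bits for the reduced dimensions
    for(int d : reduceDims)
        reduceFlag |= 1 << d;

    std::array<int, Rank - NumReduceDim> invariantDims{};

    // collect the remaining (invariant) dimensions in order
    int dim = 0;
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
            invariantDims[dim++] = i;

    return invariantDims;
}

int main()
{
    const auto inv = invariant_dims<4, 3>({1, 2, 3}); // e.g. reduce the last three dims of a rank-4 tensor
    assert(inv.size() == 1 && inv[0] == 0);           // only the batch dimension survives
    return 0;
}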
@@ -137,7 +141,7 @@ bool profile_reduce_impl_impl(bool do_verification,
                               bool do_dumpout,
                               bool time_kernel,
                               const std::vector<size_t>& inLengths,
-                              const std::vector<int>& reduceDims,
+                              const std::array<int, NumReduceDim>& reduceDims,
                               float alpha,
                               float beta)
 {
@@ -145,6 +149,8 @@ bool profile_reduce_impl_impl(bool do_verification,
     using namespace ck::tensor_operation::device::instance;
     using ck::host_common::dumpBufferToFile;

+    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
+
     constexpr bool op_support_indices =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
          ReduceOpId == ReduceTensorOp::AMAX);
@@ -279,28 +285,32 @@ bool profile_reduce_impl_impl(bool do_verification,
         reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
             static_cast<int32_t>(reduce_total_length));

-    using DeviceReduceInstPtr0 =
-        DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;
+    using DeviceReduceInstPtr =
+        DeviceReducePtr<Rank, NumReduceDim, InElementwiseOperation, AccElementwiseOperation>;

-    std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
+    std::vector<DeviceReduceInstPtr> reduce_ptrs;

     add_device_reduce_instance_threadwise<InDataType,
                                           AccDataType,
                                           OutDataType,
                                           Rank,
                                           NumReduceDim,
-                                          ReduceOpId,
+                                          ReduceOperation,
+                                          InElementwiseOperation,
+                                          AccElementwiseOperation,
                                           PropagateNan,
-                                          UseIndex>(reduce0_ptrs);
+                                          UseIndex>(reduce_ptrs);

     add_device_reduce_instance_blockwise<InDataType,
                                          AccDataType,
                                          OutDataType,
                                          Rank,
                                          NumReduceDim,
-                                         ReduceOpId,
+                                         ReduceOperation,
+                                         InElementwiseOperation,
+                                         AccElementwiseOperation,
                                          PropagateNan,
-                                         UseIndex>(reduce0_ptrs);
+                                         UseIndex>(reduce_ptrs);

     if constexpr(use_atomic_add)
     {
@@ -309,12 +319,14 @@ bool profile_reduce_impl_impl(bool do_verification,
                                                  OutDataType,
                                                  Rank,
                                                  NumReduceDim,
-                                                 ReduceOpId,
+                                                 ReduceOperation,
+                                                 InElementwiseOperation,
+                                                 AccElementwiseOperation,
                                                  PropagateNan,
-                                                 UseIndex>(reduce0_ptrs);
+                                                 UseIndex>(reduce_ptrs);
     }

-    if(reduce0_ptrs.empty())
+    if(reduce_ptrs.empty())
     {
         throw std::runtime_error("Wrong! No device REDUCE instance found");
     };
@@ -342,22 +354,22 @@ bool profile_reduce_impl_impl(bool do_verification,
                      acc_elementwise_op);
     };

-    std::vector<ck::index_t> i_inLengths;
-    std::vector<ck::index_t> i_inStrides;
-    std::vector<ck::index_t> i_outLengths;
-    std::vector<ck::index_t> i_outStrides;
+    std::array<index_t, Rank> arrInLengths;
+    std::array<index_t, Rank> arrInStrides;
+    std::array<index_t, NumOutDim> arrOutLengths;
+    std::array<index_t, NumOutDim> arrOutStrides;

-    i_inLengths.assign(inLengths.begin(), inLengths.end());
-    i_inStrides.assign(inStrides.begin(), inStrides.end());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
+    std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
+    std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
+    std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());

-    for(auto& reduce_ptr : reduce0_ptrs)
+    for(auto& reduce_ptr : reduce_ptrs)
     {
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                            i_inStrides,
-                                                            i_outLengths,
-                                                            i_outStrides,
+        auto argument_ptr = reduce_ptr->MakeArgumentPointer(arrInLengths,
+                                                            arrInStrides,
+                                                            arrOutLengths,
+                                                            arrOutStrides,
                                                             reduceDims,
                                                             alpha,
                                                             beta,
@@ -399,13 +411,12 @@ bool profile_reduce_impl_impl(bool do_verification,
                 bool single_pass;

                 out_dev.FromDevice(out.mData.data());
-                single_pass = ck::utils::check_err(out.mData, out_ref.mData);
+                single_pass = ck::utils::check_err(out, out_ref);

                 if(OutputIndex)
                 {
                     out_indices_dev.FromDevice(out_indices.mData.data());
-                    single_pass = single_pass &&
-                                  ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+                    single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
                 };

                 if(!single_pass)
@@ -478,22 +489,25 @@ bool profile_reduce_impl(bool do_verification,
                descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex))
             return;

-        pass = pass &&
-               profile_reduce_impl_impl<InDataType,
-                                        AccDataType,
-                                        OutDataType,
-                                        descType::Rank_,
-                                        descType::NumReduceDim_,
-                                        static_cast<ReduceTensorOp>(descType::ReduceOpId_),
-                                        static_cast<bool>(descType::PropagateNan_),
-                                        static_cast<bool>(descType::UseIndex_)>(do_verification,
-                                                                                init_method,
-                                                                                do_dumpout,
-                                                                                time_kernel,
-                                                                                inLengths,
-                                                                                reduceDims,
-                                                                                alpha,
-                                                                                beta);
+        std::array<ck::index_t, descType::NumReduceDim_> arrReduceDims;
+
+        std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
+
+        pass = pass && profile_reduce_impl_impl<InDataType,
+                                                AccDataType,
+                                                OutDataType,
+                                                descType::Rank_,
+                                                descType::NumReduceDim_,
+                                                static_cast<ReduceTensorOp>(descType::ReduceOpId_),
+                                                descType::PropagateNan_,
+                                                descType::UseIndex_>(do_verification,
+                                                                     init_method,
+                                                                     do_dumpout,
+                                                                     time_kernel,
+                                                                     inLengths,
+                                                                     arrReduceDims,
+                                                                     alpha,
+                                                                     beta);

         matched = true;
     });
...
...@@ -3,55 +3,27 @@ ...@@ -3,55 +3,27 @@
#pragma once #pragma once
#include <algorithm>
#include <iomanip> #include <iomanip>
#include <iostream>
#include <string>
#include <vector>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp" #include "ck/utility/data_type.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
namespace {
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace
void add_device_softmax_f16_f16_rank3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f16_f16_rank4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
void add_device_softmax_f32_f32_rank3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f32_f32_rank4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck { namespace ck {
namespace profiler { namespace profiler {
enum struct NormType enum struct SoftmaxDataType
{
BATCHNORM,
SOFTMAX,
};
enum struct NormDataType
{ {
F32_F32, // in, out F32_F32, // in, out
F16_F16, F16_F16,
...@@ -60,7 +32,7 @@ enum struct NormDataType ...@@ -60,7 +32,7 @@ enum struct NormDataType
}; };
// clang-format off // clang-format off
template <typename NormDataType> std::string type_to_string(); template <typename SoftmaxDataType> std::string type_to_string();
template <> std::string type_to_string<float>() { return "f32"; } template <> std::string type_to_string<float>() { return "f32"; }
template <> std::string type_to_string<half_t>() { return "f16"; } template <> std::string type_to_string<half_t>() { return "f16"; }
template <> std::string type_to_string<bhalf_t>() { return "bf16"; } template <> std::string type_to_string<bhalf_t>() { return "bf16"; }
...@@ -69,16 +41,15 @@ template <> std::string type_to_string<int32_t>() { return "int32"; } ...@@ -69,16 +41,15 @@ template <> std::string type_to_string<int32_t>() { return "int32"; }
// clang-format on // clang-format on
template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank> template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
void profile_normalization_impl(int do_verification, bool profile_softmax_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
bool time_kernel, bool time_kernel,
std::vector<index_t> in_length, std::vector<index_t> in_length,
std::vector<index_t> in_strides, std::vector<index_t> in_strides,
std::vector<index_t> reduce_dims, std::vector<index_t> reduce_dims,
AccDataType alpha, AccDataType alpha,
AccDataType beta, AccDataType beta)
NormType norm_type)
{ {
if(Rank != in_length.size()) if(Rank != in_length.size())
{ {
...@@ -88,62 +59,46 @@ void profile_normalization_impl(int do_verification, ...@@ -88,62 +59,46 @@ void profile_normalization_impl(int do_verification,
Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length) Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
: Tensor<InDataType>(in_length, in_strides); : Tensor<InDataType>(in_length, in_strides);
Tensor<OutDataType> out(in.mDesc); Tensor<OutDataType> out(in.mDesc);
Tensor<OutDataType> prior_out(in.mDesc);
switch(init_method) switch(init_method)
{ {
// case 0: break; case 0: break;
case 0:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
break;
case 1: case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); ck::utils::FillUniformDistributionIntegerValue<InDataType>{-5.f, 5.f}(in.begin(), in.end());
out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}); ck::utils::FillUniformDistributionIntegerValue<OutDataType>{-5.f, 5.f}(prior_out.begin(),
prior_out.end());
break; break;
default: default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0}); ck::utils::FillUniformDistribution<InDataType>{0.0f, 1.0f}(in);
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5}); ck::utils::FillUniformDistribution<OutDataType>{-0.5f, 0.5f}(prior_out);
} }
Tensor<OutDataType> out_ref(out); Tensor<OutDataType> out_ref(prior_out);
if(do_verification)
{
using ReferenceSoftmax =
tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
ReferenceSoftmax{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
}
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); DeviceMem in_dev(in.GetElementSpaceSizeInBytes());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); DeviceMem out_dev(out.GetElementSpaceSizeInBytes());
in_dev.ToDevice(in.mData.data()); in_dev.ToDevice(in.data());
out_dev.ToDevice(out.mData.data());
std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end()); std::vector<index_t> in_tensor_lengths(in.GetLengths().begin(), in.GetLengths().end());
std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end()); std::vector<index_t> in_tensor_strides(in.GetStrides().begin(), in.GetStrides().end());
// add device softmax instances // add device softmax instances
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceOpPtr = tensor_operation::device:: using DeviceOp = tensor_operation::device::
DeviceSoftmaxPtr<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>; DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
std::vector<DeviceOpPtr> instances;
if(norm_type == NormType::SOFTMAX) // get device op instances
{ const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
if constexpr(is_same<InDataType, half_t>::value && is_same<OutDataType, half_t>::value && DeviceOp>::GetInstances();
is_same<AccDataType, float>::value) std::cout << "found " << instances.size() << " instances" << std::endl;
{
if constexpr(Rank == 3)
tensor_operation::device::instance::add_device_softmax_f16_f16_rank3_instances(
instances);
else if constexpr(Rank == 4)
tensor_operation::device::instance::add_device_softmax_f16_f16_rank4_instances(
instances);
}
else if constexpr(is_same<InDataType, float>::value && is_same<OutDataType, float>::value &&
is_same<AccDataType, float>::value)
{
if constexpr(Rank == 3)
tensor_operation::device::instance::add_device_softmax_f32_f32_rank3_instances(
instances);
else if constexpr(Rank == 4)
tensor_operation::device::instance::add_device_softmax_f32_f32_rank4_instances(
instances);
}
}
if(instances.size() <= 0) if(instances.size() <= 0)
{ {
...@@ -153,21 +108,19 @@ void profile_normalization_impl(int do_verification, ...@@ -153,21 +108,19 @@ void profile_normalization_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
std::vector<bool> instance_pass;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
for(auto& inst_ptr : instances) for(auto& inst_ptr : instances)
{ {
// Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3 // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
// problem to rank 4 kernel) other than invoking IsSupportedArgument()? // problem to rank 4 kernel) other than invoking IsSupportedArgument()?
if(!(inst_ptr->GetRank() == static_cast<index_t>(i_in_lengths.size()) && if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
{ {
continue; continue;
} }
auto argument_ptr = inst_ptr->MakeArgumentPointer(i_in_lengths, auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
i_in_strides, in_tensor_strides,
reduce_dims, reduce_dims,
&alpha, &alpha,
&beta, &beta,
...@@ -181,45 +134,42 @@ void profile_normalization_impl(int do_verification, ...@@ -181,45 +134,42 @@ void profile_normalization_impl(int do_verification,
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = [", in_length, ", ") LogRange(std::cout << "input lengths = [", in_length, ", ")
<< "], " << "], "
<< "scaler = [" << alpha << ", " << beta << "]." << std::endl; << "scaler = [" << alpha << ", " << beta << "]";
return; LogRange(std::cout << ", reduce dims = [", reduce_dims, ", ") << "]." << std::endl;
instance_pass.push_back(true);
continue;
} }
out_dev.ToDevice(prior_out.data());
auto invoker_ptr = inst_ptr->MakeInvokerPointer(); auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); if(time_kernel)
{
std::size_t num_bytes = std::size_t num_bytes =
in.mDesc.GetElementSize() * sizeof(InDataType) + in.GetElementSize() * sizeof(InDataType) +
(beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType); (beta == 0.0f ? 1 : 2) * out.GetElementSize() * sizeof(OutDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl; << inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time) if(avg_time < best_avg_time)
{ {
best_instance_name = inst_ptr->GetTypeString(); best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time; best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec; best_gb_per_sec = gb_per_sec;
}
} }
if(do_verification) if(do_verification)
{ {
// TODO: factory method to dynamically switch between different reference normalizations out_dev.FromDevice(out.data());
using ReferenceFactory = bool pass = true;
tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
out_dev.FromDevice(out.mData.data());
bool pass;
if(std::is_same<InDataType, int8_t>::value) if(std::is_same<InDataType, int8_t>::value)
{ {
pass = ck::utils::check_err( pass = pass && ck::utils::check_err(
out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1); out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
if(do_log) if(do_log)
{ {
LogRangeAsType<int>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<int>(std::cout << "in : ", in.mData, ",") << std::endl;
...@@ -230,7 +180,7 @@ void profile_normalization_impl(int do_verification, ...@@ -230,7 +180,7 @@ void profile_normalization_impl(int do_verification,
} }
else else
{ {
pass = ck::utils::check_err(out.mData, out_ref.mData); pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
...@@ -247,16 +197,22 @@ void profile_normalization_impl(int do_verification, ...@@ -247,16 +197,22 @@ void profile_normalization_impl(int do_verification,
<< "], " << "], "
<< "scaler = [" << alpha << ", " << beta << "]." << std::endl; << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
} }
instance_pass.push_back(pass);
} }
} }
std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_" if(time_kernel)
<< type_to_string<OutDataType>() << ", "; {
LogRange(std::cout << "length = ", i_in_lengths, ",") << ", "; std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
LogRange(std::cout << "stride = ", i_in_strides, ",") << ", "; << type_to_string<OutDataType>() << ", ";
LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", "; LogRange(std::cout << "length = ", in_tensor_lengths, ",") << ", ";
std::cout << "alpha = " << alpha << ", " LogRange(std::cout << "stride = ", in_tensor_strides, ",") << ", ";
<< "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
<< " GB/s, " << best_instance_name << std::endl; std::cout << "alpha = " << alpha << ", "
<< "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
<< " GB/s, " << best_instance_name << std::endl;
}
return std::all_of(
std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
} }
} // namespace profiler } // namespace profiler
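The loop above follows a filter → run → time → verify pattern over device-op instances, then reduces the per-instance results with std::all_of. A minimal self-contained sketch of that control flow (hypothetical Instance interface; only the mechanics visible above are assumed):

#include <algorithm>
#include <iostream>
#include <iterator>
#include <limits>
#include <memory>
#include <vector>

// Stand-in for a device-op instance (hypothetical; not the CK interface).
struct Instance
{
    virtual ~Instance()              = default;
    virtual bool IsSupported() const = 0;
    virtual float Run()              = 0; // returns elapsed time in ms
    virtual bool Verify() const      = 0;
};

bool profile_all(std::vector<std::unique_ptr<Instance>>& instances)
{
    std::vector<bool> instance_pass;
    float best_ms = std::numeric_limits<float>::max();

    for(auto& inst : instances)
    {
        if(!inst->IsSupported())
        {
            instance_pass.push_back(true); // a skipped instance does not fail the run
            continue;
        }
        const float ms = inst->Run();
        best_ms        = std::min(best_ms, ms);
        instance_pass.push_back(inst->Verify());
    }

    std::cout << "best time: " << best_ms << " ms" << std::endl;

    // Overall result: every instance either passed verification or was skipped.
    return std::all_of(
        std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
}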
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_conv_bwd_weight_impl.hpp" #include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
namespace { namespace {
enum struct ConvLayout enum struct ConvLayout
{ {
NCHW_KCYX_NKHW, // 0 GNCHW_GKCYX_GNKHW, // 0
NHWC_KYXC_NHWK, // 1 GNHWC_GKYXC_GNHWK, // 1
}; };
enum struct ConvDataType enum struct ConvDataType
...@@ -25,24 +25,25 @@ enum struct ConvDataType ...@@ -25,24 +25,25 @@ enum struct ConvDataType
static void print_helper_msg() static void print_helper_msg()
{ {
std::cout std::cout << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
<< "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n" << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" << " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n" << " 2: Input bf16, Weight fp32, Output bf16)\n"
<< " 2: Input bf16, Weight fp32, Output bf16)\n" << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
<< "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n" "N, K, Ho, Wo]\n"
<< " 1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, K]\n" << " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
<< "arg4: verification (0: no, 1: yes)\n" "N, Ho, Wo, K]\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" << "arg4: verification (0: no, 1: yes)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n" << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg7: time kernel (0: no, 1: yes)\n" << "arg6: print tensor value (0: no; 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n" << "arg7: time kernel (0: no, 1: yes)\n"
<< std::endl; << ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n"
<< std::endl;
} }
} // namespace } // namespace
int profile_conv_bwd_weight(int argc, char* argv[]) int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{ {
// 8 for control, 1 for num_dim_spatial // 8 for control, 1 for num_dim_spatial
if(argc < 9) if(argc < 9)
...@@ -75,17 +76,17 @@ int profile_conv_bwd_weight(int argc, char* argv[]) ...@@ -75,17 +76,17 @@ int profile_conv_bwd_weight(int argc, char* argv[])
using F16 = ck::half_t; using F16 = ck::half_t;
using BF16 = ck::bhalf_t; using BF16 = ck::bhalf_t;
using NWC = ck::tensor_layout::convolution::NWC; using GNWC = ck::tensor_layout::convolution::GNWC;
using NHWC = ck::tensor_layout::convolution::NHWC; using GNHWC = ck::tensor_layout::convolution::GNHWC;
using NDHWC = ck::tensor_layout::convolution::NDHWC; using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
using KXC = ck::tensor_layout::convolution::KXC; using GKXC = ck::tensor_layout::convolution::GKXC;
using KYXC = ck::tensor_layout::convolution::KYXC; using GKYXC = ck::tensor_layout::convolution::GKYXC;
using KZYXC = ck::tensor_layout::convolution::KZYXC; using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
using NWK = ck::tensor_layout::convolution::NWK; using GNWK = ck::tensor_layout::convolution::GNWK;
using NHWK = ck::tensor_layout::convolution::NHWK; using GNHWK = ck::tensor_layout::convolution::GNHWK;
using NDHWK = ck::tensor_layout::convolution::NDHWK; using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
constexpr auto I1 = ck::Number<1>{}; constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{}; constexpr auto I2 = ck::Number<2>{};
...@@ -108,64 +109,64 @@ int profile_conv_bwd_weight(int argc, char* argv[]) ...@@ -108,64 +109,64 @@ int profile_conv_bwd_weight(int argc, char* argv[])
using WeiDataType = decltype(wei_type); using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type); using OutDataType = decltype(out_type);
bool pass = ck::profiler::profile_conv_bwd_weight_impl<NDimSpatial, bool pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType>( OutDataType>(
do_verification, init_method, do_log, time_kernel, params, split_k); do_verification, init_method, do_log, time_kernel, params, split_k);
return pass ? 0 : 1; return pass ? 0 : 1;
}; };
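For context, profile above is a generic lambda: the types of its tag arguments (ck::Number<N>, the layout tags, the data-type tags) select the template instantiation at compile time, while the dispatch chain that follows maps runtime flags onto those tags. A standalone sketch of the same mechanism (hypothetical run<>() stand-in, standard C++ only):

#include <iostream>

// Minimal analogue of ck::Number: a type that carries an integer.
template <int N>
struct Number
{
    static constexpr int value = N;
};

// Stand-in for the templated profiler implementation (hypothetical).
template <int NDim, typename DataType>
int run()
{
    std::cout << "NDim = " << NDim << ", sizeof(DataType) = " << sizeof(DataType) << '\n';
    return 0;
}

int main()
{
    // Generic lambda: the argument *types* carry the compile-time configuration.
    auto profile = [](auto ndim_tag, auto data_type_tag) {
        constexpr int ndim = decltype(ndim_tag)::value;
        using DataType     = decltype(data_type_tag);
        return run<ndim, DataType>();
    };

    const int num_dim_spatial = 2; // runtime flag, e.g. parsed from argv
    if(num_dim_spatial == 2)
        return profile(Number<2>{}, float{});
    return profile(Number<1>{}, double{});
}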
if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK) if(num_dim_spatial == 1 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{}); return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) else if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{}); return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) else if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, F32{}, BF16{}); return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{});
} }
} }
else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK) else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{}); return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) else if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{}); return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) else if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, F32{}, BF16{}); return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{});
} }
} }
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK) else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{ {
if(data_type == ConvDataType::F32_F32_F32) if(data_type == ConvDataType::F32_F32_F32)
{ {
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{}); return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{});
} }
else if(data_type == ConvDataType::F16_F16_F16) else if(data_type == ConvDataType::F16_F16_F16)
{ {
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{}); return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{});
} }
else if(data_type == ConvDataType::BF16_F32_BF16) else if(data_type == ConvDataType::BF16_F32_BF16)
{ {
// fp32 atomic add is used for weight tensor in bf16 kernel // fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, F32{}, BF16{}); return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{});
} }
} }
......
...@@ -12,8 +12,7 @@ using ck::index_t; ...@@ -12,8 +12,7 @@ using ck::index_t;
struct LayernormArgParser struct LayernormArgParser
{ {
std::unordered_map<std::string, std::vector<int>> long_opts = { std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}}};
{"length", {}}, {"strideXY", {}}, {"strideGamma", {}}, {"strideBeta", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i) bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{ {
...@@ -52,9 +51,6 @@ void print_help_layernorm() ...@@ -52,9 +51,6 @@ void print_help_layernorm()
<< "arg4: print tensor value (0: no; 1: yes)\n" << "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=no, 1=yes)\n" << "arg5: time kernel (0=no, 1=yes)\n"
<< "--length: tensor extents (e.g, --length 1024 1024) \n" << "--length: tensor extents (e.g, --length 1024 1024) \n"
<< "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n"
<< "--strideGamma: tensor strides (e.g, --strideGamma 1)\n"
<< "--strideBeta: tensor strides (e.g, --strideBeta 1)\n"
<< std::endl; << std::endl;
} }
...@@ -77,10 +73,7 @@ int profile_layernorm(int argc, char* argv[]) ...@@ -77,10 +73,7 @@ int profile_layernorm(int argc, char* argv[])
// parse the long options // parse the long options
arg_parser(argc, argv); arg_parser(argc, argv);
const std::vector<index_t> length = arg_parser.long_opts["length"]; const std::vector<index_t> length = arg_parser.long_opts["length"];
const std::vector<index_t> strideXY = arg_parser.long_opts["strideXY"];
const std::vector<index_t> strideGamma = arg_parser.long_opts["strideGamma"];
const std::vector<index_t> strideBeta = arg_parser.long_opts["strideBeta"];
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float; using F32 = float;
...@@ -88,25 +81,13 @@ int profile_layernorm(int argc, char* argv[]) ...@@ -88,25 +81,13 @@ int profile_layernorm(int argc, char* argv[])
if(data_type == ck::DataTypeEnum::Half) if(data_type == ck::DataTypeEnum::Half)
{ {
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>(do_verification, ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>(
init_method, do_verification, init_method, do_log, time_kernel, length);
do_log,
time_kernel,
length,
strideXY,
strideGamma,
strideBeta);
} }
else if(data_type == ck::DataTypeEnum::Float) else if(data_type == ck::DataTypeEnum::Float)
{ {
ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>(do_verification, ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>(
init_method, do_verification, init_method, do_log, time_kernel, length);
do_log,
time_kernel,
length,
strideXY,
strideGamma,
strideBeta);
} }
else else
{ {
......
...@@ -5,17 +5,13 @@ ...@@ -5,17 +5,13 @@
#include <vector> #include <vector>
#include <unordered_map> #include <unordered_map>
#include "profiler/include/profile_normalization_impl.hpp" #include "profiler/include/profile_softmax_impl.hpp"
using ck::index_t; using ck::index_t;
using ck::profiler::NormDataType; using ck::profiler::SoftmaxDataType;
using ck::profiler::NormType;
struct ArgParser struct ArgParser
{ {
std::unordered_map<std::string, NormType> norm_dict = {{"batchnorm", NormType::BATCHNORM},
{"softmax", NormType::SOFTMAX}};
std::unordered_map<std::string, std::vector<int>> long_opts = { std::unordered_map<std::string, std::vector<int>> long_opts = {
{"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}}; {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}};
...@@ -50,7 +46,7 @@ struct ArgParser ...@@ -50,7 +46,7 @@ struct ArgParser
void print_help() void print_help()
{ {
std::cout << "arg1: tensor operation (batchnorm/softmax)\n" std::cout << "arg1: tensor operation (softmax)\n"
<< "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n" << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
<< "arg3: verification (0: no; 1: yes)\n" << "arg3: verification (0: no; 1: yes)\n"
<< "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n" << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
...@@ -64,7 +60,7 @@ void print_help() ...@@ -64,7 +60,7 @@ void print_help()
<< std::endl; << std::endl;
} }
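The ArgParser pattern used throughout these profilers registers a map from long-option names to integer lists and consumes every value following --name up to the next option. A simplified, self-contained sketch of that rule (not the exact implementation):

#include <cstring>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main(int argc, char* argv[])
{
    std::unordered_map<std::string, std::vector<int>> long_opts = {
        {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}};

    for(int i = 1; i < argc; ++i)
    {
        if(std::strncmp(argv[i], "--", 2) != 0)
            continue;
        const auto it = long_opts.find(argv[i] + 2); // key is the text after "--"
        if(it == long_opts.end())
            continue;
        // consume integer values until the next "--option" or the end of argv
        for(int j = i + 1; j < argc && std::strncmp(argv[j], "--", 2) != 0; ++j)
            it->second.push_back(std::stoi(argv[j]));
    }

    // e.g. "--length 8 16 32 --reduce 2" yields length = {8, 16, 32}, reduce = {2}
    for(const int v : long_opts["length"])
        std::cout << v << ' ';
    std::cout << std::endl;
}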
int profile_normalization(int argc, char* argv[]) int profile_softmax(int argc, char* argv[])
{ {
if(argc <= 2) if(argc <= 2)
{ {
...@@ -75,12 +71,11 @@ int profile_normalization(int argc, char* argv[]) ...@@ -75,12 +71,11 @@ int profile_normalization(int argc, char* argv[])
ArgParser arg_parser; ArgParser arg_parser;
// short unnamed options // short unnamed options
const NormType norm_type = arg_parser.norm_dict[argv[1]]; const SoftmaxDataType data_type = static_cast<SoftmaxDataType>(std::stoi(argv[2]));
const NormDataType data_type = static_cast<NormDataType>(std::stoi(argv[2])); const bool do_verification = std::stoi(argv[3]);
const bool do_verification = std::stoi(argv[3]); const int init_method = std::stoi(argv[4]);
const int init_method = std::stoi(argv[4]); const bool do_log = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[5]); const bool time_kernel = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[6]);
// parse the long options // parse the long options
arg_parser(argc, argv); arg_parser(argc, argv);
...@@ -91,68 +86,64 @@ int profile_normalization(int argc, char* argv[]) ...@@ -91,68 +86,64 @@ int profile_normalization(int argc, char* argv[])
arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0]; arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0];
const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0]; const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0];
// Rank 3
if(length.size() == 3) if(length.size() == 3)
{ {
if(data_type == NormDataType::F16_F16) if(data_type == SoftmaxDataType::F16_F16)
{ {
ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 3>( ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
do_verification, init_method,
init_method, do_log,
do_log, time_kernel,
time_kernel, length,
length, stride,
stride, reduce,
reduce, float(alpha),
float(alpha), float(beta));
float(beta),
norm_type);
} }
else if(data_type == NormDataType::F32_F32) else if(data_type == SoftmaxDataType::F32_F32)
{ {
ck::profiler::profile_normalization_impl<float, float, float, 3>(do_verification, ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
init_method, init_method,
do_log, do_log,
time_kernel, time_kernel,
length, length,
stride, stride,
reduce, reduce,
float(alpha), float(alpha),
float(beta), float(beta));
norm_type);
} }
else else
{ {
throw std::runtime_error("not implemented yet"); throw std::runtime_error("not implemented yet");
} }
} }
// Rank 4
else if(length.size() == 4) else if(length.size() == 4)
{ {
if(data_type == NormDataType::F16_F16) if(data_type == SoftmaxDataType::F16_F16)
{ {
ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 4>( ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
do_verification, init_method,
init_method, do_log,
do_log, time_kernel,
time_kernel, length,
length, stride,
stride, reduce,
reduce, float(alpha),
float(alpha), float(beta));
float(beta),
norm_type);
} }
else if(data_type == NormDataType::F32_F32) else if(data_type == SoftmaxDataType::F32_F32)
{ {
ck::profiler::profile_normalization_impl<float, float, float, 4>(do_verification, ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
init_method, init_method,
do_log, do_log,
time_kernel, time_kernel,
length, length,
stride, stride,
reduce, reduce,
float(alpha), float(alpha),
float(beta), float(beta));
norm_type);
} }
else else
{ {
......
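A note on the scalars parsed above: alpha and beta follow the extended-softmax convention out = alpha * softmax(in) + beta * out_prior (beta = 0 reduces to a plain softmax), which is why the impl uploads the prior output to the device and counts an extra output read in its bandwidth estimate when beta != 0.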
...@@ -18,9 +18,9 @@ int profile_conv_fwd(int, char*[]); ...@@ -18,9 +18,9 @@ int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]); int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]); int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_bwd_data(int, char*[]); int profile_conv_bwd_data(int, char*[]);
int profile_conv_bwd_weight(int, char*[]);
int profile_grouped_conv_fwd(int, char*[]); int profile_grouped_conv_fwd(int, char*[]);
int profile_normalization(int, char*[]); int profile_grouped_conv_bwd_weight(int, char*[]);
int profile_softmax(int, char*[]);
int profile_layernorm(int, char*[]); int profile_layernorm(int, char*[]);
int profile_groupnorm(int, char*[]); int profile_groupnorm(int, char*[]);
int profile_reduce(int, char*[]); int profile_reduce(int, char*[]);
...@@ -43,8 +43,9 @@ static void print_helper_message() ...@@ -43,8 +43,9 @@ static void print_helper_message()
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n" " conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n" " conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
" conv_bwd_data: Convolution Backward Data\n" " conv_bwd_data: Convolution Backward Data\n"
" conv_bwd_weight: Convolution Backward Weight\n"
" grouped_conv_fwd: Grouped Convolution Forward\n" " grouped_conv_fwd: Grouped Convolution Forward\n"
" grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n"
" softmax: Softmax\n"
" reduce: Reduce\n"); " reduce: Reduce\n");
// clang-format on // clang-format on
} }
...@@ -117,21 +118,21 @@ int main(int argc, char* argv[]) ...@@ -117,21 +118,21 @@ int main(int argc, char* argv[])
{ {
return profile_conv_bwd_data(argc, argv); return profile_conv_bwd_data(argc, argv);
} }
else if(strcmp(argv[1], "conv_bwd_weight") == 0)
{
return profile_conv_bwd_weight(argc, argv);
}
else if(strcmp(argv[1], "grouped_conv_fwd") == 0) else if(strcmp(argv[1], "grouped_conv_fwd") == 0)
{ {
return profile_grouped_conv_fwd(argc, argv); return profile_grouped_conv_fwd(argc, argv);
} }
else if(strcmp(argv[1], "conv_bwd_weight") == 0)
{
return profile_grouped_conv_bwd_weight(argc, argv);
}
else if(strcmp(argv[1], "reduce") == 0) else if(strcmp(argv[1], "reduce") == 0)
{ {
return profile_reduce(argc, argv); return profile_reduce(argc, argv);
} }
else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "softmax") == 0) else if(strcmp(argv[1], "softmax") == 0)
{ {
return profile_normalization(argc, argv); return profile_softmax(argc, argv);
} }
else if(strcmp(argv[1], "layernorm") == 0) else if(strcmp(argv[1], "layernorm") == 0)
{ {
......
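The operation dispatch in the profiler's main() is a linear strcmp chain over argv[1]; an equivalent table-driven sketch of the same pattern (stub entry points only, not ckProfiler's actual structure):

#include <iostream>
#include <map>
#include <string>

// Stand-ins for the real profiler entry points (hypothetical).
int profile_softmax_stub(int, char*[]) { return 0; }
int profile_layernorm_stub(int, char*[]) { return 0; }

int main(int argc, char* argv[])
{
    using Entry = int (*)(int, char*[]);
    const std::map<std::string, Entry> ops = {
        {"softmax", profile_softmax_stub},
        {"layernorm", profile_layernorm_stub},
    };

    if(argc < 2)
    {
        std::cout << "usage: profiler <operation> ..." << std::endl;
        return 1;
    }

    const auto it = ops.find(argv[1]);
    if(it == ops.end())
    {
        std::cout << "unknown operation: " << argv[1] << std::endl;
        return 1;
    }
    return it->second(argc, argv);
}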
...@@ -11,7 +11,7 @@ cmake ...@@ -11,7 +11,7 @@ cmake
-D CMAKE_CXX_FLAGS="-O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \ -D CMAKE_CXX_FLAGS="-O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=ON \ -D BUILD_DEV=ON \
-D GPU_TARGETS=gfx908;gfx90a \ -D GPU_TARGETS="gfx908;gfx90a" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \ -D USE_BITINT_EXTENSION_INT4=OFF \
${MY_PROJECT_SOURCE} ${MY_PROJECT_SOURCE}
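Note on the quoting of GPU_TARGETS: in a shell, an unquoted semicolon in gfx908;gfx90a would terminate the cmake command at that point, so the semicolon-separated target list must be passed as a single quoted argument.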
......
...@@ -11,7 +11,7 @@ cmake ...@@ -11,7 +11,7 @@ cmake
-D CMAKE_CXX_FLAGS="-O3" \ -D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=OFF \ -D BUILD_DEV=OFF \
-D GPU_TARGETS=gfx908;gfx90a \ -D GPU_TARGETS="gfx908;gfx90a" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \ -D USE_BITINT_EXTENSION_INT4=OFF \
${MY_PROJECT_SOURCE} ${MY_PROJECT_SOURCE}
......
...@@ -81,7 +81,7 @@ def parse_logfile(logfile): ...@@ -81,7 +81,7 @@ def parse_logfile(logfile):
StrideA=[] StrideA=[]
StrideB=[] StrideB=[]
StrideC=[] StrideC=[]
if 'perf_gemm' in logfile: if 'perf_gemm.log' in logfile:
for line in open(logfile): for line in open(logfile):
if 'Best Perf' in line: if 'Best Perf' in line:
lst=line.split() lst=line.split()
...@@ -120,14 +120,14 @@ def parse_logfile(logfile): ...@@ -120,14 +120,14 @@ def parse_logfile(logfile):
res = [x for _,x in sorted(zip(tests,tflops))] res = [x for _,x in sorted(zip(tests,tflops))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))] #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1)) test_list=list(range(1,len(tests)+1))
#parse conv_fwd performance tests: #parse conv_fwd and conv_bwd performance tests:
elif 'conv_fwd' in logfile: elif 'conv_fwd' in logfile or 'conv_bwd_data' in logfile:
for line in open(logfile): for line in open(logfile):
if 'tflops:' in line: if 'tflops:' in line:
lst=line.split() lst=line.split()
res.append(lst[1]) res.append(lst[1])
#parse all other performance tests: #parse all other performance tests:
elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile or 'conv_bwd_data' in logfile or 'gemm_bilinear' in logfile or 'reduction' in logfile: elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile or 'gemm_bilinear' in logfile or 'reduction' in logfile:
for line in open(logfile): for line in open(logfile):
if 'Best Perf' in line: if 'Best Perf' in line:
lst=line.split() lst=line.split()
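Ordering matters with these substring tests: 'perf_gemm' alone would also match perf_gemm_bilinear.log, so the gemm branch keys on the exact log name and gemm_bilinear logs fall through to this generic branch.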
...@@ -149,7 +149,7 @@ def store_new_test_result(table_name, test_results, testlist, branch_name, node_ ...@@ -149,7 +149,7 @@ def store_new_test_result(table_name, test_results, testlist, branch_name, node_
df=pd.DataFrame(data=[params],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Environment','Datetime']) df=pd.DataFrame(data=[params],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Environment','Datetime'])
df_add=pd.DataFrame(data=[test_results],columns=testlist) df_add=pd.DataFrame(data=[test_results],columns=testlist)
df=pd.concat([df,df_add],axis=1) df=pd.concat([df,df_add],axis=1)
print("new test results dataframe:",df) #print("new test results dataframe:",df)
df.to_sql(table_name,connection,if_exists='append',index=False) df.to_sql(table_name,connection,if_exists='append',index=False)
return 0 return 0
...@@ -165,7 +165,7 @@ def compare_test_to_baseline(baseline,test,testlist): ...@@ -165,7 +165,7 @@ def compare_test_to_baseline(baseline,test,testlist):
print("test # ",i,"shows regression by {:.3f}%".format( print("test # ",i,"shows regression by {:.3f}%".format(
(float(test[i])-base_list[i])/base_list[i]*100)) (float(test[i])-base_list[i])/base_list[i]*100))
regression=1 regression=1
ave_perf=ave_perf+float(test[i])/base_list[i] if base_list[i]>0: ave_perf=ave_perf+float(test[i])/base_list[i]
if regression==0: if regression==0:
print("no regressions found") print("no regressions found")
ave_perf=ave_perf/len(base_list) ave_perf=ave_perf/len(base_list)
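One caveat with the base_list[i] > 0 guard: the entries it skips still count toward len(base_list) in the division above, so a zero baseline slightly deflates the reported average performance ratio.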
...@@ -248,7 +248,7 @@ def main(): ...@@ -248,7 +248,7 @@ def main():
conn = sqlEngine.connect() conn = sqlEngine.connect()
#save gemm performance tests: #save gemm performance tests:
if 'perf_gemm' in filename: if 'perf_gemm.log' in filename:
#write the ck_gemm_test_params table only needed once the test set changes #write the ck_gemm_test_params table only needed once the test set changes
#post_test_params(test_list,conn) #post_test_params(test_list,conn)
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
......
...@@ -6,11 +6,10 @@ include(googletest) ...@@ -6,11 +6,10 @@ include(googletest)
add_custom_target(tests) add_custom_target(tests)
function(add_test_executable TEST_NAME) function(add_test_executable TEST_NAME)
message("adding test ${TEST_NAME}") message("adding test ${TEST_NAME}")
add_executable(${TEST_NAME} ${ARGN}) add_executable(${TEST_NAME} ${ARGN})
add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> ) add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>)
add_dependencies(tests ${TEST_NAME}) add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME}) add_dependencies(check ${TEST_NAME})
rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
...@@ -23,14 +22,14 @@ function(add_gtest_executable TEST_NAME) ...@@ -23,14 +22,14 @@ function(add_gtest_executable TEST_NAME)
add_executable(${TEST_NAME} ${ARGN}) add_executable(${TEST_NAME} ${ARGN})
add_dependencies(tests ${TEST_NAME}) add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME}) add_dependencies(check ${TEST_NAME})
# suppress gtest warnings # suppress gtest warnings
target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef) target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef)
target_link_libraries(${TEST_NAME} PRIVATE gtest_main) target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
gtest_discover_tests(${TEST_NAME}) add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> )
rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
endfunction(add_gtest_executable TEST_NAME) endfunction(add_gtest_executable TEST_NAME)
add_subdirectory(magic_number_division) add_subdirectory(magic_number_division)
add_subdirectory(space_filling_curve) add_subdirectory(space_filling_curve)
add_subdirectory(conv_util) add_subdirectory(conv_util)
...@@ -42,14 +41,15 @@ add_subdirectory(batched_gemm) ...@@ -42,14 +41,15 @@ add_subdirectory(batched_gemm)
add_subdirectory(batched_gemm_reduce) add_subdirectory(batched_gemm_reduce)
add_subdirectory(batched_gemm_gemm) add_subdirectory(batched_gemm_gemm)
add_subdirectory(batched_gemm_softmax_gemm) add_subdirectory(batched_gemm_softmax_gemm)
add_subdirectory(batched_gemm_masking_scale_softmax_gemm_permute) add_subdirectory(batched_gemm_softmax_gemm_permute)
add_subdirectory(grouped_gemm) add_subdirectory(grouped_gemm)
add_subdirectory(reduce) add_subdirectory(reduce)
add_subdirectory(convnd_fwd) add_subdirectory(convnd_fwd)
add_subdirectory(convnd_bwd_weight)
add_subdirectory(convnd_bwd_data) add_subdirectory(convnd_bwd_data)
add_subdirectory(grouped_convnd_fwd) add_subdirectory(grouped_convnd_fwd)
add_subdirectory(grouped_convnd_bwd_weight)
add_subdirectory(block_to_ctile_map) add_subdirectory(block_to_ctile_map)
add_subdirectory(softmax) add_subdirectory(softmax)
add_subdirectory(layernorm) add_subdirectory(normalization)
add_subdirectory(data_type) add_subdirectory(data_type)
add_subdirectory(elementwise_normalization)
...@@ -2,3 +2,14 @@ add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp) ...@@ -2,3 +2,14 @@ add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_fp16 PRIVATE utility) target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance) target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/include/profile_batched_gemm_impl.hpp"
namespace {
using ADataType = ck::bhalf_t;
using BDataType = ck::bhalf_t;
using CDataType = ck::bhalf_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
} // namespace
int main()
{
int M = 256;
int N = 256;
int K = 128;
int BatchCount = 3;
bool pass = true;
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/include/profile_batched_gemm_impl.hpp"
namespace {
using ADataType = float;
using BDataType = float;
using CDataType = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
} // namespace
int main()
{
int M = 256;
int N = 256;
int K = 128;
int BatchCount = 3;
bool pass = true;
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1;
}
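In the profile_batched_gemm_impl calls above, the three per-matrix strides follow the layouts (a row-major operand strides by its column count, a column-major one by its row count), and the batch strides M * K, K * N and M * N each span one full matrix. A small helper expressing that rule (illustrative only, not a CK API):

#include <cstddef>
#include <iostream>

struct MatDims
{
    int rows;
    int cols;
};

// Leading dimension of a dense, unpadded matrix (illustrative helper, not a CK API).
int leading_dim(MatDims d, bool row_major) { return row_major ? d.cols : d.rows; }

// Per-batch stride: one full matrix worth of elements.
std::size_t batch_stride(MatDims d) { return static_cast<std::size_t>(d.rows) * d.cols; }

int main()
{
    const MatDims a{256, 128}; // an M x K operand with M = 256, K = 128
    std::cout << leading_dim(a, true) << '\n'   // 128 (== K, row-major)
              << leading_dim(a, false) << '\n'  // 256 (== M, column-major)
              << batch_stride(a) << '\n';       // 32768 (== M * K)
}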