Commit f5e3a6e8 authored by Chao Liu's avatar Chao Liu
Browse files

update conv example

parent c9b86e0c
add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp) add_example_executable(example_conv2d_fwd_bias_relu_xdl_fp16 conv2d_fwd_bias_relu_xdl_fp16.cpp)
target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util) target_link_libraries(example_conv2d_fwd_bias_relu_xdl_fp16 PRIVATE utility)
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace { namespace {
...@@ -32,8 +32,10 @@ using InLayout = ck::tensor_layout::convolution::NHWC; ...@@ -32,8 +32,10 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC; using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK; using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu; using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set; static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set;
...@@ -82,23 +84,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device:: ...@@ -82,23 +84,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device::
8>; // CBlockTransferScalarPerVector_NWaveNPerXdl 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on // clang-format on
using ReferenceConvFwdInstance = void print_helper_msg()
ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
void PrintUseMsg()
{ {
std::cout << "arg1: verification (0=no, 1=yes)\n" std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=n0, 1=yes)\n" << "arg3: time kernel (0=no, 1=yes)\n"
<< "Following arguments:\n" << "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n" << " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n" << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n" << " <in_n_hi_wi_c image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n" << " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n" << " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n" << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
...@@ -106,140 +101,151 @@ void PrintUseMsg() ...@@ -106,140 +101,151 @@ void PrintUseMsg()
<< std::endl; << std::endl;
} }
ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[]) ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{ {
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) const ck::index_t N = std::stoi(argv[arg_idx++]);
int num_dim_spatial = 2; const ck::index_t K = std::stoi(argv[arg_idx++]);
int conv_args = 3 + num_dim_spatial * 6; const ck::index_t C = std::stoi(argv[arg_idx++]);
int cmdline_nargs = conv_args + 4;
if(cmdline_nargs != argc)
{
PrintUseMsg();
exit(0);
}
ck::tensor_operation::device::ConvParams params;
int arg_idx = 4;
params.num_dim_spatial_ = num_dim_spatial; std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
params.N_ = std::stoi(argv[arg_idx++]); std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
params.K_ = std::stoi(argv[arg_idx++]); std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
params.C_ = std::stoi(argv[arg_idx++]); std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
std::vector<ck::index_t> input_left_pads(num_dim_spatial);
std::vector<ck::index_t> input_right_pads(num_dim_spatial);
params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
} }
params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
} }
params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
} }
params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
} }
params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); input_left_pads[i] = std::stoi(argv[arg_idx++]);
} }
params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); input_right_pads[i] = std::stoi(argv[arg_idx++]);
} }
return params; return ck::tensor_operation::device::ConvParams{num_dim_spatial,
N,
K,
C,
filter_spatial_lengths,
input_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
} }
} // anonymous namespace } // namespace
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 1; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
const int num_dim_spatial = 2; int num_dim_spatial = 2;
ck::tensor_operation::device::ConvParams params; ck::tensor_operation::device::ConvParams params{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc >= 4) if(argc == 1)
{
// use default
}
else if(argc == 4)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
} }
else
if(argc >= 5)
{ {
params = ParseConvParams(argc, argv); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv);
} }
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_), const auto in_element_op = InElementOp{};
static_cast<std::size_t>(params.C_)}; const auto wei_element_op = WeiElementOp{};
input_dims.insert(std::end(input_dims), const auto out_element_op = OutElementOp{};
std::begin(params.input_spatial_lengths_),
std::end(params.input_spatial_lengths_)); auto f_nhwc_host_tensor_descriptor =
[](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_), std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
static_cast<std::size_t>(params.C_)}; static_cast<std::size_t>(c)};
filter_dims.insert(std::end(filter_dims), nhwc_lengths.insert(
std::begin(params.filter_spatial_lengths_), nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());
std::end(params.filter_spatial_lengths_));
return HostTensorDescriptor(nhwc_lengths);
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths(); };
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.K_)}; Tensor<InDataType> in_n_hi_wi_c(
output_dims.insert(std::end(output_dims), f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_));
std::begin(output_spatial_lengths), Tensor<WeiDataType> wei_k_y_x_c(
std::end(output_spatial_lengths)); f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_));
Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
Tensor<OutDataType> host_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
Tensor<OutDataType> device_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
// bias: assume contiguous 1d vector // bias: assume contiguous 1d vector
Tensor<OutDataType> bias( Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)}))); HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
Tensor<OutDataType> out_n_ho_wo_k_host(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
Tensor<OutDataType> out_n_ho_wo_k_device(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
std::cout << "input: " << input.mDesc << std::endl; std::cout << "in_n_hi_wi_c: " << in_n_hi_wi_c.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl; std::cout << "wei_k_y_x_c: " << wei_k_y_x_c.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl; std::cout << "bias_k: " << bias_k.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl; std::cout << "output: " << out_n_ho_wo_k_host.mDesc << std::endl;
switch(init_method) switch(init_method)
{ {
case 0: break; case 0: break;
case 1: case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5}); wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}); bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break; break;
default: default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0}); in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5}); wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0}); bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
} }
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); DeviceMem in_device_buf(sizeof(InDataType) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace()); DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace()); DeviceMem out_device_buf(sizeof(OutDataType) * out_n_ho_wo_k_device.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data()); in_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_device_buf.ToDevice(weights.mData.data()); wei_device_buf.ToDevice(wei_k_y_x_c.mData.data());
bias_device_buf.ToDevice(bias.mData.data()); bias_device_buf.ToDevice(bias_k.mData.data());
// do GEMM
auto conv = DeviceConvFwdInstance{}; auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker(); auto invoker = conv.MakeInvoker();
auto argument = auto argument =
...@@ -252,59 +258,90 @@ int main(int argc, char* argv[]) ...@@ -252,59 +258,90 @@ int main(int argc, char* argv[])
params.C_, params.C_,
params.input_spatial_lengths_, params.input_spatial_lengths_,
params.filter_spatial_lengths_, params.filter_spatial_lengths_,
output_spatial_lengths, params.GetOutputSpatialLengths(),
params.conv_filter_strides_, params.conv_filter_strides_,
params.conv_filter_dilations_, params.conv_filter_dilations_,
params.input_left_pads_, params.input_left_pads_,
params.input_right_pads_, params.input_right_pads_,
InElementOp{}, in_element_op,
WeiElementOp{}, wei_element_op,
OutElementOp{}); out_element_op);
if(!conv.IsSupportedArgument(argument)) if(!conv.IsSupportedArgument(argument))
{ {
throw std::runtime_error( throw std::runtime_error(
"wrong! device operator with the specified compilation parameters does " "wrong! device_conv with the specified compilation parameters does "
"not support this problem"); "not support this Conv problem");
} }
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops( std::size_t flop = params.GetFlops();
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();
std::size_t num_btype =
get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
params.C_,
params.K_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
output_spatial_lengths) +
sizeof(OutDataType) * (params.K_);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time; float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / ave_time; float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< std::endl; << conv.GetTypeString() << std::endl;
if(do_verification) if(do_verification)
{ {
auto ref_conv = ReferenceConvFwdInstance{}; // use OutDataType for intermediate data
auto ref_invoker = ref_conv.MakeInvoker(); Tensor<OutDataType> tmp_n_ho_wo_k_host(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
auto ref_conv =
ck::tensor_operation::host::ReferenceConvFwd<2,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
PassThrough>();
auto ref_argument = ref_conv.MakeArgument(input, auto ref_invoker = ref_conv.MakeInvoker();
weights, auto ref_argument = ref_conv.MakeArgument(in_n_hi_wi_c,
host_output, wei_k_y_x_c,
bias, tmp_n_ho_wo_k_host,
params.conv_filter_strides_, params.conv_filter_strides_,
params.conv_filter_dilations_, params.conv_filter_dilations_,
params.input_left_pads_, params.input_left_pads_,
params.input_right_pads_, params.input_right_pads_,
InElementOp{}, in_element_op,
WeiElementOp{}, wei_element_op,
OutElementOp{}); PassThrough{});
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1; // FIXME: implement reference pointwise operation
for(int n = 0; n < params.N_; n++)
{
for(int ho = 0; ho < params.output_spatial_lengths_[0]; ho++)
{
for(int wo = 0; wo < params.output_spatial_lengths_[1]; wo++)
{
for(int k = 0; k < params.K_; k++)
{
out_element_op(out_n_ho_wo_k_host(n, ho, wo, k),
tmp_n_ho_wo_k_host(n, ho, wo, k),
bias_k(k));
}
}
}
}
out_device_buf.FromDevice(out_n_ho_wo_k_device.mData.data());
return ck::utils::check_err(out_n_ho_wo_k_host.mData,
out_n_ho_wo_k_device.mData,
"Error: incorrect results!",
1e-5f,
1e-4f)
? 0
: 1;
} }
return 0; return 0;
......
# FIXME: should fix validation failure add_example_executable(example_conv2d_fwd_bias_relu_add_xdl_fp16 conv2d_fwd_bias_relu_add_xdl_fp16.cpp)
add_example_executable_no_testing(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp) target_link_libraries(example_conv2d_fwd_bias_relu_add_xdl_fp16 PRIVATE utility)
target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_util)
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace { namespace {
...@@ -32,8 +32,10 @@ using InLayout = ck::tensor_layout::convolution::NHWC; ...@@ -32,8 +32,10 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC; using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK; using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
static constexpr auto ConvFwdDefault = static constexpr auto ConvFwdDefault =
...@@ -79,23 +81,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device:: ...@@ -79,23 +81,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device::
8>; // CBlockTransferScalarPerVector_NWaveNPerXdl 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on // clang-format on
using ReferenceConvFwdInstance = void print_helper_msg()
ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation_Add<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
void PrintUseMsg()
{ {
std::cout << "arg1: verification (0=no, 1=yes)\n" std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=n0, 1=yes)\n" << "arg3: time kernel (0=no, 1=yes)\n"
<< "Following arguments:\n" << "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n" << " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n" << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n" << " <in_n_hi_wi_c image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n" << " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n" << " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n" << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
...@@ -103,154 +98,157 @@ void PrintUseMsg() ...@@ -103,154 +98,157 @@ void PrintUseMsg()
<< std::endl; << std::endl;
} }
ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[]) ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{ {
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) const ck::index_t N = std::stoi(argv[arg_idx++]);
int num_dim_spatial = 2; const ck::index_t K = std::stoi(argv[arg_idx++]);
int conv_args = 3 + num_dim_spatial * 6; const ck::index_t C = std::stoi(argv[arg_idx++]);
int cmdline_nargs = conv_args + 4;
if(cmdline_nargs != argc)
{
PrintUseMsg();
exit(0);
}
ck::tensor_operation::device::ConvParams params;
int arg_idx = 4;
params.num_dim_spatial_ = num_dim_spatial; std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
params.N_ = std::stoi(argv[arg_idx++]); std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
params.K_ = std::stoi(argv[arg_idx++]); std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
params.C_ = std::stoi(argv[arg_idx++]); std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
std::vector<ck::index_t> input_left_pads(num_dim_spatial);
std::vector<ck::index_t> input_right_pads(num_dim_spatial);
params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
} }
params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
} }
params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
} }
params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
} }
params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); input_left_pads[i] = std::stoi(argv[arg_idx++]);
} }
params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); input_right_pads[i] = std::stoi(argv[arg_idx++]);
} }
return params; return ck::tensor_operation::device::ConvParams{num_dim_spatial,
N,
K,
C,
filter_spatial_lengths,
input_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
} }
} // anonymous namespace } // anonymous namespace
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
using namespace ck::utils::conv; print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 1; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
const int num_dim_spatial = 2; int num_dim_spatial = 2;
ck::tensor_operation::device::ConvParams params; ck::tensor_operation::device::ConvParams params{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc >= 4) if(argc == 1)
{
// use default
}
else if(argc == 4)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
} }
else
if(argc >= 5)
{ {
params = ParseConvParams(argc, argv); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv);
} }
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_), const auto in_element_op = InElementOp{};
static_cast<std::size_t>(params.C_)}; const auto wei_element_op = WeiElementOp{};
input_dims.insert(std::end(input_dims), const auto out_element_op = OutElementOp{};
std::begin(params.input_spatial_lengths_),
std::end(params.input_spatial_lengths_));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
static_cast<std::size_t>(params.C_)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths_),
std::end(params.filter_spatial_lengths_));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
Tensor<OutDataType> host_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
Tensor<OutDataType> device_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
// bias: assume contiguous 1d vector auto f_nhwc_host_tensor_descriptor =
Tensor<OutDataType> bias( [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)}))); std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
static_cast<std::size_t>(c)};
nhwc_lengths.insert(
nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());
// residual: assume same layout as output tensor return HostTensorDescriptor(nhwc_lengths);
Tensor<OutDataType> residual(get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); };
std::cout << "input: " << input.mDesc << std::endl; Tensor<InDataType> in_n_hi_wi_c(
std::cout << "weights: " << weights.mDesc << std::endl; f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_));
std::cout << "output: " << host_output.mDesc << std::endl; Tensor<WeiDataType> wei_k_y_x_c(
std::cout << "bias: " << bias.mDesc << std::endl; f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_));
std::cout << "residual: " << residual.mDesc << std::endl; // bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
// resi: assume same layout as output tensor
Tensor<OutDataType> resi_n_ho_wo_k(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
Tensor<OutDataType> out_n_ho_wo_k_host(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
Tensor<OutDataType> out_n_ho_wo_k_device(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
std::cout << "in_n_hi_wi_c: " << in_n_hi_wi_c.mDesc << std::endl;
std::cout << "wei_k_y_x_c: " << wei_k_y_x_c.mDesc << std::endl;
std::cout << "bias_k: " << bias_k.mDesc << std::endl;
std::cout << "resi_n_ho_wo_k: " << resi_n_ho_wo_k.mDesc << std::endl;
std::cout << "out_n_ho_wo_k: " << out_n_ho_wo_k_host.mDesc << std::endl;
switch(init_method) switch(init_method)
{ {
case 0: break; case 0: break;
case 1: case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2}); in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2}); wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2}); bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
residual.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2}); resi_n_ho_wo_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break; break;
default: default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0}); in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5}); wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0}); bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
residual.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0}); resi_n_ho_wo_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
} }
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); DeviceMem in_device_buf(sizeof(InDataType) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace()); DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace()); DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_ho_wo_k.mDesc.GetElementSpace());
DeviceMem resi_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpace()); DeviceMem out_device_buf(sizeof(OutDataType) * out_n_ho_wo_k_device.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
resi_device_buf.ToDevice(residual.mData.data());
const auto in_element_op = InElementOp{}; in_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
const auto wei_element_op = WeiElementOp{}; wei_device_buf.ToDevice(wei_k_y_x_c.mData.data());
const auto out_element_op = OutElementOp{}; bias_device_buf.ToDevice(bias_k.mData.data());
resi_device_buf.ToDevice(resi_n_ho_wo_k.mData.data());
auto conv = DeviceConvFwdInstance{}; auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker(); auto invoker = conv.MakeInvoker();
...@@ -265,7 +263,7 @@ int main(int argc, char* argv[]) ...@@ -265,7 +263,7 @@ int main(int argc, char* argv[])
params.C_, params.C_,
params.input_spatial_lengths_, params.input_spatial_lengths_,
params.filter_spatial_lengths_, params.filter_spatial_lengths_,
output_spatial_lengths, params.output_spatial_lengths_,
params.conv_filter_strides_, params.conv_filter_strides_,
params.conv_filter_dilations_, params.conv_filter_dilations_,
params.input_left_pads_, params.input_left_pads_,
...@@ -281,47 +279,75 @@ int main(int argc, char* argv[]) ...@@ -281,47 +279,75 @@ int main(int argc, char* argv[])
"not support this problem"); "not support this problem");
} }
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops( std::size_t flop = params.GetFlops();
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();
std::size_t num_btype =
get_btype<InDataType, WeiDataType, OutDataType>(params.N_, float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
params.C_, float gb_per_sec = num_btype / 1.E6 / avg_time;
params.K_, std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
params.input_spatial_lengths_, << conv.GetTypeString() << std::endl;
params.filter_spatial_lengths_,
output_spatial_lengths) +
sizeof(OutDataType) * (params.K_) +
sizeof(OutDataType) *
(params.N_ * params.K_ * output_spatial_lengths[0] * output_spatial_lengths[1]);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification) if(do_verification)
{ {
auto ref_conv = ReferenceConvFwdInstance{}; // use OutDataType for intermediate data
auto ref_invoker = ref_conv.MakeInvoker(); Tensor<OutDataType> tmp_n_ho_wo_k_host(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
auto ref_conv =
ck::tensor_operation::host::ReferenceConvFwd<2,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
PassThrough>();
auto ref_argument = ref_conv.MakeArgument(input, auto ref_invoker = ref_conv.MakeInvoker();
weights, auto ref_argument = ref_conv.MakeArgument(in_n_hi_wi_c,
host_output, wei_k_y_x_c,
bias, tmp_n_ho_wo_k_host,
residual,
params.conv_filter_strides_, params.conv_filter_strides_,
params.conv_filter_dilations_, params.conv_filter_dilations_,
params.input_left_pads_, params.input_left_pads_,
params.input_right_pads_, params.input_right_pads_,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); PassThrough{});
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1; // FIXME: implement reference pointwise operation
for(int n = 0; n < params.N_; n++)
{
for(int ho = 0; ho < params.output_spatial_lengths_[0]; ho++)
{
for(int wo = 0; wo < params.output_spatial_lengths_[1]; wo++)
{
for(int k = 0; k < params.K_; k++)
{
out_element_op(out_n_ho_wo_k_host(n, ho, wo, k),
tmp_n_ho_wo_k_host(n, ho, wo, k),
bias_k(k),
resi_n_ho_wo_k(n, ho, wo, k));
}
}
}
}
out_device_buf.FromDevice(out_n_ho_wo_k_device.mData.data());
return ck::utils::check_err(out_n_ho_wo_k_host.mData,
out_n_ho_wo_k_device.mData,
"Error: incorrect results!",
1e-5f,
1e-4f)
? 0
: 1;
} }
return 0; return 0;
......
...@@ -33,8 +33,8 @@ void print_helper_msg() ...@@ -33,8 +33,8 @@ void print_helper_msg()
<< " <right padding>, (ie RightPy, RightPx for 2D)\n" << " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl; << std::endl;
} }
ck::tensor_operation::device::ConvParams
ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[]) parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{ {
const ck::index_t N = std::stoi(argv[arg_idx++]); const ck::index_t N = std::stoi(argv[arg_idx++]);
......
...@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault = static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NumDimSpatial> template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl< using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, //
InDataType, // InDataType, //
WeiDataType, // WeiDataType, //
OutDataType, // OutDataType, //
...@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc ...@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
WeiElementOp, // Weights Elementwise Operation WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize 256, // BlockSize
128, // MPerBlock 128, // MPerBlock
256, // NPerBlock 256, // NPerBlock
......
...@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault = static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NumDimSpatial> template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl< using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial
InDataType, // InDataType, //
WeiDataType, // WeiDataType, //
OutDataType, // OutDataType, //
...@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc ...@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
WeiElementOp, // Weights Elementwise Operation WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize 256, // BlockSize
256, // MPerBlock 256, // MPerBlock
128, // NPerBlock 128, // NPerBlock
......
...@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault = static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NumDimSpatial> template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl< using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial
InDataType, // InDataType, //
WeiDataType, // WeiDataType, //
OutDataType, // OutDataType, //
...@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc ...@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
WeiElementOp, // Weights Elementwise Operation WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize 256, // BlockSize
128, // MPerBlock 128, // MPerBlock
128, // NPerBlock 128, // NPerBlock
......
...@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault = static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NumDimSpatial> template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl< using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial
InDataType, // InDataType, //
WeiDataType, // WeiDataType, //
OutDataType, // OutDataType, //
...@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc ...@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
WeiElementOp, // Weights Elementwise Operation WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize 256, // BlockSize
128, // MPerBlock 128, // MPerBlock
256, // NPerBlock 256, // NPerBlock
...@@ -89,6 +89,10 @@ int main(int argc, char* argv[]) ...@@ -89,6 +89,10 @@ int main(int argc, char* argv[])
params = parse_conv_params(num_dim_spatial, 5, argv); params = parse_conv_params(num_dim_spatial, 5, argv);
} }
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1) if(num_dim_spatial == 1)
{ {
return run_conv_fwd_nhwc<1, return run_conv_fwd_nhwc<1,
...@@ -99,9 +103,13 @@ int main(int argc, char* argv[]) ...@@ -99,9 +103,13 @@ int main(int argc, char* argv[])
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<1>, DeviceConvNDFwdInstance<1>>(do_verification,
ReferenceConvNDFwdInstance<1>>( init_method,
params, do_verification, init_method, time_kernel); time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
} }
else if(num_dim_spatial == 2) else if(num_dim_spatial == 2)
{ {
...@@ -113,9 +121,13 @@ int main(int argc, char* argv[]) ...@@ -113,9 +121,13 @@ int main(int argc, char* argv[])
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<2>, DeviceConvNDFwdInstance<2>>(do_verification,
ReferenceConvNDFwdInstance<2>>( init_method,
params, do_verification, init_method, time_kernel); time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
} }
else if(num_dim_spatial == 3) else if(num_dim_spatial == 3)
{ {
...@@ -127,9 +139,13 @@ int main(int argc, char* argv[]) ...@@ -127,9 +139,13 @@ int main(int argc, char* argv[])
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<3>, DeviceConvNDFwdInstance<3>>(do_verification,
ReferenceConvNDFwdInstance<3>>( init_method,
params, do_verification, init_method, time_kernel); time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
} }
return 0; return 0;
......
...@@ -28,8 +28,6 @@ add_subdirectory(04_gemm_add_add_fastgelu) ...@@ -28,8 +28,6 @@ add_subdirectory(04_gemm_add_add_fastgelu)
add_subdirectory(06_conv2d_fwd_bias_relu) add_subdirectory(06_conv2d_fwd_bias_relu)
add_subdirectory(07_conv2d_fwd_bias_relu_add) add_subdirectory(07_conv2d_fwd_bias_relu_add)
add_subdirectory(09_convnd_fwd) add_subdirectory(09_convnd_fwd)
add_subdirectory(10_conv2d_bwd_data)
add_subdirectory(11_conv2d_bwd_weight)
add_subdirectory(12_reduce) add_subdirectory(12_reduce)
add_subdirectory(13_pool2d_fwd) add_subdirectory(13_pool2d_fwd)
add_subdirectory(14_gemm_xdl_requant_relu_requant) add_subdirectory(14_gemm_xdl_requant_relu_requant)
......
...@@ -19,7 +19,7 @@ getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecializat ...@@ -19,7 +19,7 @@ getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecializat
switch(s) switch(s)
{ {
case ConvolutionBackwardDataSpecialization::Default: return "Default"; case ConvolutionBackwardDataSpecialization::Default: return "Default";
case ConvolutionBackwardDataSpecialization::FFilter1x1Stride1Pad0: case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0:
return "FFilter1x1Stride1Pad0"; return "FFilter1x1Stride1Pad0";
default: return "Unrecognized specialization!"; default: return "Unrecognized specialization!";
} }
......
...@@ -117,12 +117,12 @@ struct Bilinear ...@@ -117,12 +117,12 @@ struct Bilinear
struct AddRelu struct AddRelu
{ {
template <typename T> template <typename Y, typename X0, typename X1>
__host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const;
template <> template <>
__host__ __device__ constexpr void __host__ __device__ constexpr void
operator()<float>(float& y, const float& x0, const float& x1) const operator()<float, float, float>(float& y, const float& x0, const float& x1) const
{ {
const float a = x0 + x1; const float a = x0 + x1;
y = a > 0.0f ? a : 0.0f; y = a > 0.0f ? a : 0.0f;
...@@ -130,7 +130,7 @@ struct AddRelu ...@@ -130,7 +130,7 @@ struct AddRelu
template <> template <>
__host__ __device__ constexpr void __host__ __device__ constexpr void
operator()<double>(double& y, const double& x0, const double& x1) const operator()<double, double, double>(double& y, const double& x0, const double& x1) const
{ {
const double a = x0 + x1; const double a = x0 + x1;
y = a > 0.0 ? a : 0.0; y = a > 0.0 ? a : 0.0;
...@@ -138,11 +138,19 @@ struct AddRelu ...@@ -138,11 +138,19 @@ struct AddRelu
template <> template <>
__host__ __device__ constexpr void __host__ __device__ constexpr void
operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const operator()<half_t, half_t, half_t>(half_t& y, const half_t& x0, const half_t& x1) const
{ {
const half_t a = x0 + x1; const half_t a = x0 + x1;
y = a > type_convert<half_t>(0.0f) ? a : type_convert<half_t>(0.0f); y = a > type_convert<half_t>(0.0f) ? a : type_convert<half_t>(0.0f);
}; };
template <>
__host__ __device__ constexpr void
operator()<half_t, float, half_t>(half_t& y, const float& x0, const half_t& x1) const
{
const float a = x0 + x1;
y = a > type_convert<half_t>(0.0f) ? a : type_convert<half_t>(0.0f);
};
}; };
struct AddHardswish struct AddHardswish
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment