Commit f5e3a6e8 authored by Chao Liu's avatar Chao Liu
Browse files

update conv example

parent c9b86e0c
add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util)
add_example_executable(example_conv2d_fwd_bias_relu_xdl_fp16 conv2d_fwd_bias_relu_xdl_fp16.cpp)
target_link_libraries(example_conv2d_fwd_bias_relu_xdl_fp16 PRIVATE utility)
......@@ -16,7 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace {
......@@ -32,8 +32,10 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set;
......@@ -82,23 +84,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device::
8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
using ReferenceConvFwdInstance =
ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
void PrintUseMsg()
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=n0, 1=yes)\n"
<< "Following arguments:\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <in_n_hi_wi_c image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
......@@ -106,140 +101,151 @@ void PrintUseMsg()
<< std::endl;
}
ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[])
ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int num_dim_spatial = 2;
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 4;
if(cmdline_nargs != argc)
{
PrintUseMsg();
exit(0);
}
ck::tensor_operation::device::ConvParams params;
int arg_idx = 4;
const ck::index_t N = std::stoi(argv[arg_idx++]);
const ck::index_t K = std::stoi(argv[arg_idx++]);
const ck::index_t C = std::stoi(argv[arg_idx++]);
params.num_dim_spatial_ = num_dim_spatial;
params.N_ = std::stoi(argv[arg_idx++]);
params.K_ = std::stoi(argv[arg_idx++]);
params.C_ = std::stoi(argv[arg_idx++]);
std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
std::vector<ck::index_t> input_left_pads(num_dim_spatial);
std::vector<ck::index_t> input_right_pads(num_dim_spatial);
params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
}
params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
input_left_pads[i] = std::stoi(argv[arg_idx++]);
}
params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
input_right_pads[i] = std::stoi(argv[arg_idx++]);
}
return params;
return ck::tensor_operation::device::ConvParams{num_dim_spatial,
N,
K,
C,
filter_spatial_lengths,
input_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
}
} // anonymous namespace
} // namespace
int main(int argc, char* argv[])
{
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
const int num_dim_spatial = 2;
int num_dim_spatial = 2;
ck::tensor_operation::device::ConvParams params;
ck::tensor_operation::device::ConvParams params{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc >= 4)
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
if(argc >= 5)
else
{
params = ParseConvParams(argc, argv);
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv);
}
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.C_)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths_),
std::end(params.input_spatial_lengths_));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
static_cast<std::size_t>(params.C_)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths_),
std::end(params.filter_spatial_lengths_));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
Tensor<OutDataType> host_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
Tensor<OutDataType> device_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
auto f_nhwc_host_tensor_descriptor =
[](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
static_cast<std::size_t>(c)};
nhwc_lengths.insert(
nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());
return HostTensorDescriptor(nhwc_lengths);
};
Tensor<InDataType> in_n_hi_wi_c(
f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_));
Tensor<WeiDataType> wei_k_y_x_c(
f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias(
Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
Tensor<OutDataType> out_n_ho_wo_k_host(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
Tensor<OutDataType> out_n_ho_wo_k_device(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl;
std::cout << "in_n_hi_wi_c: " << in_n_hi_wi_c.mDesc << std::endl;
std::cout << "wei_k_y_x_c: " << wei_k_y_x_c.mDesc << std::endl;
std::cout << "bias_k: " << bias_k.mDesc << std::endl;
std::cout << "output: " << out_n_ho_wo_k_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace());
DeviceMem in_device_buf(sizeof(InDataType) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * out_n_ho_wo_k_device.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
in_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_device_buf.ToDevice(wei_k_y_x_c.mData.data());
bias_device_buf.ToDevice(bias_k.mData.data());
// do GEMM
auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument =
......@@ -252,59 +258,90 @@ int main(int argc, char* argv[])
params.C_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
output_spatial_lengths,
params.GetOutputSpatialLengths(),
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
in_element_op,
wei_element_op,
out_element_op);
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device operator with the specified compilation parameters does "
"not support this problem");
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype =
get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
params.C_,
params.K_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
output_spatial_lengths) +
sizeof(OutDataType) * (params.K_);
std::size_t flop = params.GetFlops();
std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(do_verification)
{
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
// use OutDataType for intermediate data
Tensor<OutDataType> tmp_n_ho_wo_k_host(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
auto ref_conv =
ck::tensor_operation::host::ReferenceConvFwd<2,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
PassThrough>();
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
bias,
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_hi_wi_c,
wei_k_y_x_c,
tmp_n_ho_wo_k_host,
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
in_element_op,
wei_element_op,
PassThrough{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
// FIXME: implement reference pointwise operation
for(int n = 0; n < params.N_; n++)
{
for(int ho = 0; ho < params.output_spatial_lengths_[0]; ho++)
{
for(int wo = 0; wo < params.output_spatial_lengths_[1]; wo++)
{
for(int k = 0; k < params.K_; k++)
{
out_element_op(out_n_ho_wo_k_host(n, ho, wo, k),
tmp_n_ho_wo_k_host(n, ho, wo, k),
bias_k(k));
}
}
}
}
out_device_buf.FromDevice(out_n_ho_wo_k_device.mData.data());
return ck::utils::check_err(out_n_ho_wo_k_host.mData,
out_n_ho_wo_k_device.mData,
"Error: incorrect results!",
1e-5f,
1e-4f)
? 0
: 1;
}
return 0;
......
# FIXME: should fix validation failure
add_example_executable_no_testing(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_util)
add_example_executable(example_conv2d_fwd_bias_relu_add_xdl_fp16 conv2d_fwd_bias_relu_add_xdl_fp16.cpp)
target_link_libraries(example_conv2d_fwd_bias_relu_add_xdl_fp16 PRIVATE utility)
......@@ -16,7 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace {
......@@ -32,8 +32,10 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
static constexpr auto ConvFwdDefault =
......@@ -79,23 +81,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device::
8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
using ReferenceConvFwdInstance =
ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation_Add<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
void PrintUseMsg()
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=n0, 1=yes)\n"
<< "Following arguments:\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <in_n_hi_wi_c image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
......@@ -103,154 +98,157 @@ void PrintUseMsg()
<< std::endl;
}
ck::tensor_operation::device::ConvParams ParseConvParams(int argc, char* argv[])
ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int num_dim_spatial = 2;
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 4;
if(cmdline_nargs != argc)
{
PrintUseMsg();
exit(0);
}
ck::tensor_operation::device::ConvParams params;
int arg_idx = 4;
const ck::index_t N = std::stoi(argv[arg_idx++]);
const ck::index_t K = std::stoi(argv[arg_idx++]);
const ck::index_t C = std::stoi(argv[arg_idx++]);
params.num_dim_spatial_ = num_dim_spatial;
params.N_ = std::stoi(argv[arg_idx++]);
params.K_ = std::stoi(argv[arg_idx++]);
params.C_ = std::stoi(argv[arg_idx++]);
std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
std::vector<ck::index_t> input_left_pads(num_dim_spatial);
std::vector<ck::index_t> input_right_pads(num_dim_spatial);
params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
}
params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
input_left_pads[i] = std::stoi(argv[arg_idx++]);
}
params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
input_right_pads[i] = std::stoi(argv[arg_idx++]);
}
return params;
return ck::tensor_operation::device::ConvParams{num_dim_spatial,
N,
K,
C,
filter_spatial_lengths,
input_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
}
} // anonymous namespace
int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
const int num_dim_spatial = 2;
int num_dim_spatial = 2;
ck::tensor_operation::device::ConvParams params;
ck::tensor_operation::device::ConvParams params{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc >= 4)
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
if(argc >= 5)
else
{
params = ParseConvParams(argc, argv);
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv);
}
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.C_)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths_),
std::end(params.input_spatial_lengths_));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
static_cast<std::size_t>(params.C_)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths_),
std::end(params.filter_spatial_lengths_));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
Tensor<OutDataType> host_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
Tensor<OutDataType> device_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
auto f_nhwc_host_tensor_descriptor =
[](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
static_cast<std::size_t>(c)};
nhwc_lengths.insert(
nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());
// residual: assume same layout as output tensor
Tensor<OutDataType> residual(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
return HostTensorDescriptor(nhwc_lengths);
};
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl;
std::cout << "residual: " << residual.mDesc << std::endl;
Tensor<InDataType> in_n_hi_wi_c(
f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_));
Tensor<WeiDataType> wei_k_y_x_c(
f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
// resi: assume same layout as output tensor
Tensor<OutDataType> resi_n_ho_wo_k(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
Tensor<OutDataType> out_n_ho_wo_k_host(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
Tensor<OutDataType> out_n_ho_wo_k_device(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
std::cout << "in_n_hi_wi_c: " << in_n_hi_wi_c.mDesc << std::endl;
std::cout << "wei_k_y_x_c: " << wei_k_y_x_c.mDesc << std::endl;
std::cout << "bias_k: " << bias_k.mDesc << std::endl;
std::cout << "resi_n_ho_wo_k: " << resi_n_ho_wo_k.mDesc << std::endl;
std::cout << "out_n_ho_wo_k: " << out_n_ho_wo_k_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
residual.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
resi_n_ho_wo_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
residual.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
resi_n_ho_wo_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace());
DeviceMem resi_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
resi_device_buf.ToDevice(residual.mData.data());
DeviceMem in_device_buf(sizeof(InDataType) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_ho_wo_k.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * out_n_ho_wo_k_device.mDesc.GetElementSpace());
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
in_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_device_buf.ToDevice(wei_k_y_x_c.mData.data());
bias_device_buf.ToDevice(bias_k.mData.data());
resi_device_buf.ToDevice(resi_n_ho_wo_k.mData.data());
auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker();
......@@ -265,7 +263,7 @@ int main(int argc, char* argv[])
params.C_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
output_spatial_lengths,
params.output_spatial_lengths_,
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
......@@ -281,47 +279,75 @@ int main(int argc, char* argv[])
"not support this problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype =
get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
params.C_,
params.K_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
output_spatial_lengths) +
sizeof(OutDataType) * (params.K_) +
sizeof(OutDataType) *
(params.N_ * params.K_ * output_spatial_lengths[0] * output_spatial_lengths[1]);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
std::size_t flop = params.GetFlops();
std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(do_verification)
{
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
// use OutDataType for intermediate data
Tensor<OutDataType> tmp_n_ho_wo_k_host(
f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
auto ref_conv =
ck::tensor_operation::host::ReferenceConvFwd<2,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
PassThrough>();
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
bias,
residual,
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_hi_wi_c,
wei_k_y_x_c,
tmp_n_ho_wo_k_host,
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op);
PassThrough{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
// FIXME: implement reference pointwise operation
for(int n = 0; n < params.N_; n++)
{
for(int ho = 0; ho < params.output_spatial_lengths_[0]; ho++)
{
for(int wo = 0; wo < params.output_spatial_lengths_[1]; wo++)
{
for(int k = 0; k < params.K_; k++)
{
out_element_op(out_n_ho_wo_k_host(n, ho, wo, k),
tmp_n_ho_wo_k_host(n, ho, wo, k),
bias_k(k),
resi_n_ho_wo_k(n, ho, wo, k));
}
}
}
}
out_device_buf.FromDevice(out_n_ho_wo_k_device.mData.data());
return ck::utils::check_err(out_n_ho_wo_k_host.mData,
out_n_ho_wo_k_device.mData,
"Error: incorrect results!",
1e-5f,
1e-4f)
? 0
: 1;
}
return 0;
......
......@@ -33,8 +33,8 @@ void print_helper_msg()
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
}
ck::tensor_operation::device::ConvParams
ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{
const ck::index_t N = std::stoi(argv[arg_idx++]);
......
......@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NumDimSpatial>
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, //
InDataType, //
WeiDataType, //
OutDataType, //
......@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
......
......@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NumDimSpatial>
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial
InDataType, //
WeiDataType, //
OutDataType, //
......@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
......
......@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NumDimSpatial>
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial
InDataType, //
WeiDataType, //
OutDataType, //
......@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
......
......@@ -20,8 +20,9 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NumDimSpatial>
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial
InDataType, //
WeiDataType, //
OutDataType, //
......@@ -30,7 +31,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
......@@ -89,6 +89,10 @@ int main(int argc, char* argv[])
params = parse_conv_params(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1)
{
return run_conv_fwd_nhwc<1,
......@@ -99,9 +103,13 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<1>,
ReferenceConvNDFwdInstance<1>>(
params, do_verification, init_method, time_kernel);
DeviceConvNDFwdInstance<1>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 2)
{
......@@ -113,9 +121,13 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<2>,
ReferenceConvNDFwdInstance<2>>(
params, do_verification, init_method, time_kernel);
DeviceConvNDFwdInstance<2>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
}
else if(num_dim_spatial == 3)
{
......@@ -127,9 +139,13 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNDFwdInstance<3>,
ReferenceConvNDFwdInstance<3>>(
params, do_verification, init_method, time_kernel);
DeviceConvNDFwdInstance<3>>(do_verification,
init_method,
time_kernel,
params,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
......
......@@ -28,8 +28,6 @@ add_subdirectory(04_gemm_add_add_fastgelu)
add_subdirectory(06_conv2d_fwd_bias_relu)
add_subdirectory(07_conv2d_fwd_bias_relu_add)
add_subdirectory(09_convnd_fwd)
add_subdirectory(10_conv2d_bwd_data)
add_subdirectory(11_conv2d_bwd_weight)
add_subdirectory(12_reduce)
add_subdirectory(13_pool2d_fwd)
add_subdirectory(14_gemm_xdl_requant_relu_requant)
......
......@@ -19,7 +19,7 @@ getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecializat
switch(s)
{
case ConvolutionBackwardDataSpecialization::Default: return "Default";
case ConvolutionBackwardDataSpecialization::FFilter1x1Stride1Pad0:
case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0:
return "FFilter1x1Stride1Pad0";
default: return "Unrecognized specialization!";
}
......
......@@ -117,12 +117,12 @@ struct Bilinear
struct AddRelu
{
template <typename T>
__host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
template <typename Y, typename X0, typename X1>
__host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const;
template <>
__host__ __device__ constexpr void
operator()<float>(float& y, const float& x0, const float& x1) const
operator()<float, float, float>(float& y, const float& x0, const float& x1) const
{
const float a = x0 + x1;
y = a > 0.0f ? a : 0.0f;
......@@ -130,7 +130,7 @@ struct AddRelu
template <>
__host__ __device__ constexpr void
operator()<double>(double& y, const double& x0, const double& x1) const
operator()<double, double, double>(double& y, const double& x0, const double& x1) const
{
const double a = x0 + x1;
y = a > 0.0 ? a : 0.0;
......@@ -138,11 +138,19 @@ struct AddRelu
template <>
__host__ __device__ constexpr void
operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
operator()<half_t, half_t, half_t>(half_t& y, const half_t& x0, const half_t& x1) const
{
const half_t a = x0 + x1;
y = a > type_convert<half_t>(0.0f) ? a : type_convert<half_t>(0.0f);
};
template <>
__host__ __device__ constexpr void
operator()<half_t, float, half_t>(half_t& y, const float& x0, const half_t& x1) const
{
const float a = x0 + x1;
y = a > type_convert<half_t>(0.0f) ? a : type_convert<half_t>(0.0f);
};
};
struct AddHardswish
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment