Commit 21892202 authored by Chao Liu

update example

parent d789a53d
add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)

# FIXME: re-enable this example as test when SWDEV-335738 is fixed
add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
@@ -18,16 +18,74 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{
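    // arg_idx is the position in argv of the first convolution argument (N); it is
    // advanced as each value is consumed.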
ck::tensor_operation::device::ConvParams params;
params.num_dim_spatial_ = num_dim_spatial;
params.N_ = std::stoi(argv[arg_idx++]);
params.K_ = std::stoi(argv[arg_idx++]);
params.C_ = std::stoi(argv[arg_idx++]);
params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
}
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
}
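// For illustration only (the numbers below are hypothetical, not part of this commit),
// the argument layout printed above corresponds to a flat command line such as the
// following 2D case:
//   ./example_convnd_fwd_xdl_fp32 1 1 1 2  128 256 192  3 3  71 71  2 2  1 1  1 1  1 1
//   (verify, init, time, num_dim_spatial, N K C, Y X, Hi Wi, Sy Sx, Dy Dx,
//    LeftPy LeftPx, RightPy RightPx)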
template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename AccDataType,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
          ...
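The body of run_conv_fwd is folded out of this hunk. As a rough, non-authoritative sketch based on the example code this commit removes from the individual .cpp files below (device buffer setup and verification omitted, and the helper name here is invented), the device instance is expected to be driven along these lines:

// Sketch only, not the actual run_conv_fwd implementation: it mirrors the
// MakeArgument / IsSupportedArgument / MakeInvoker / Run pattern of the removed examples.
template <typename DeviceConvInstance,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp>
float run_device_conv_sketch(ck::tensor_operation::device::ConvParams params,
                             InDataType* p_in_dev,
                             WeiDataType* p_wei_dev,
                             OutDataType* p_out_dev,
                             bool time_kernel)
{
    auto conv     = DeviceConvInstance{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(p_in_dev,
                                      p_wei_dev,
                                      p_out_dev,
                                      params.N_,
                                      params.K_,
                                      params.C_,
                                      params.input_spatial_lengths_,
                                      params.filter_spatial_lengths_,
                                      params.GetOutputSpatialLengths(),
                                      params.conv_filter_strides_,
                                      params.conv_filter_dilations_,
                                      params.input_left_pads_,
                                      params.input_right_pads_,
                                      InElementOp{},
                                      WeiElementOp{},
                                      OutElementOp{});

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error("wrong! device_conv with the specified compilation parameters "
                                 "does not support this Conv problem");
    }

    // time_kernel toggles kernel timing, as in the removed example code
    return invoker.Run(argument, StreamConfig{nullptr, time_kernel});
}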
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_common.hpp"

using InDataType  = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
@@ -28,10 +11,6 @@ using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -87,151 +66,77 @@ using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<
int main(int argc, char* argv[])
{
    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    int num_dim_spatial = 2;

    ck::tensor_operation::device::ConvParams params;

    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);

        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    if(num_dim_spatial == 1)
    {
        return run_conv_fwd<1,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<1>,
                            ReferenceConvNDFwdInstance<1>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 2)
    {
        return run_conv_fwd<2,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<2>,
                            ReferenceConvNDFwdInstance<2>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 3)
    {
        return run_conv_fwd<3,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<3>,
                            ReferenceConvNDFwdInstance<3>>(
            params, do_verification, init_method, time_kernel);
    }

    return 0;
}
@@ -11,10 +11,6 @@ using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -106,9 +102,6 @@ int main(int argc, char* argv[])
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
@@ -123,9 +116,6 @@ int main(int argc, char* argv[])
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
@@ -140,9 +130,6 @@ int main(int argc, char* argv[])
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
@@ -150,4 +137,6 @@ int main(int argc, char* argv[])
                            ReferenceConvNDFwdInstance<3>>(
            params, do_verification, init_method, time_kernel);
    }

    return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_common.hpp"

using InDataType  = float;
using WeiDataType = float;
@@ -35,48 +18,42 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
    InDataType,     //
    WeiDataType,    //
    OutDataType,    //
    AccDataType,    //
    InElementOp,    // Input Elementwise Operation
    WeiElementOp,   // Weights Elementwise Operation
    OutElementOp,   // Output Elementwise Operation
    ConvFwdDefault, // ConvForwardSpecialization
    NumDimSpatial,  // NumDimSpatial
    256,            // BlockSize
    256,            // MPerBlock
    128,            // NPerBlock
    4,              // K0PerBlock
    4,              // K1
    32,             // MPerXDL
    32,             // NPerXDL
    4,              // MXdlPerWave
    2,              // NXdlPerWave
    S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
    S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
    2,              // ABlockTransferSrcVectorDim
    4,              // ABlockTransferSrcScalarPerVector
    4,              // ABlockTransferDstScalarPerVector_K1
    true,           // ABlockLdsAddExtraM
    S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
    S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
    2,              // BBlockTransferSrcVectorDim
    4,              // BBlockTransferSrcScalarPerVector
    4,              // BBlockTransferDstScalarPerVector_K1
    true,           // BBlockTransferAddExtraN
    7,              // CThreadTransferSrcDstVectorDim
    1>;             // CThreadTransferDstScalarPerVector

template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
@@ -87,260 +64,79 @@ using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<
                                                                                OutElementOp,
                                                                                NumDimSpatial>;

int main(int argc, char* argv[])
{
    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    int num_dim_spatial = 2;

    ck::tensor_operation::device::ConvParams params;

    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);

        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    if(num_dim_spatial == 1)
    {
        return run_conv_fwd<1,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<1>,
                            ReferenceConvNDFwdInstance<1>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 2)
    {
        return run_conv_fwd<2,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<2>,
                            ReferenceConvNDFwdInstance<2>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 3)
    {
        return run_conv_fwd<3,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<3>,
                            ReferenceConvNDFwdInstance<3>>(
            params, do_verification, init_method, time_kernel);
    }

    return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_common.hpp"

using InDataType  = double;
using WeiDataType = double;
@@ -35,48 +18,42 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
    InDataType,     //
    WeiDataType,    //
    OutDataType,    //
    AccDataType,    //
    InElementOp,    // Input Elementwise Operation
    WeiElementOp,   // Weights Elementwise Operation
    OutElementOp,   // Output Elementwise Operation
    ConvFwdDefault, // ConvForwardSpecialization
    NumDimSpatial,  // NumDimSpatial
    256,            // BlockSize
    128,            // MPerBlock
    128,            // NPerBlock
    4,              // K0PerBlock
    2,              // K1
    16,             // MPerXDL
    16,             // NPerXDL
    4,              // MXdlPerWave
    4,              // NXdlPerWave
    S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
    S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
    2,              // ABlockTransferSrcVectorDim
    2,              // ABlockTransferSrcScalarPerVector
    2,              // ABlockTransferDstScalarPerVector_K1
    true,           // ABlockLdsAddExtraM
    S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
    S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
    2,              // BBlockTransferSrcVectorDim
    2,              // BBlockTransferSrcScalarPerVector
    2,              // BBlockTransferDstScalarPerVector_K1
    true,           // BBlockTransferAddExtraN
    7,              // CThreadTransferSrcDstVectorDim
    1>;             // CThreadTransferDstScalarPerVector

template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
@@ -87,261 +64,79 @@ using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<
                                                                                OutElementOp,
                                                                                NumDimSpatial>;

int main(int argc, char* argv[])
{
    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    int num_dim_spatial = 2;

    ck::tensor_operation::device::ConvParams params;

    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);

        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    if(num_dim_spatial == 1)
    {
        return run_conv_fwd<1,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<1>,
                            ReferenceConvNDFwdInstance<1>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 2)
    {
        return run_conv_fwd<2,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<2>,
                            ReferenceConvNDFwdInstance<2>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 3)
    {
        return run_conv_fwd<3,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<3>,
                            ReferenceConvNDFwdInstance<3>>(
            params, do_verification, init_method, time_kernel);
    }

    return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_common.hpp"

using InDataType  = int8_t;
using WeiDataType = int8_t;
@@ -28,60 +11,49 @@ using AccDataType = int32_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
    InDataType,     //
    WeiDataType,    //
    OutDataType,    //
    AccDataType,    //
    InElementOp,    // Input Elementwise Operation
    WeiElementOp,   // Weights Elementwise Operation
    OutElementOp,   // Output Elementwise Operation
    ConvFwdDefault, // ConvForwardSpecialization
    NumDimSpatial,  // NumDimSpatial
    256,            // BlockSize
    128,            // MPerBlock
    256,            // NPerBlock
    4,              // K0PerBlock
    16,             // K1
    32,             // MPerXdl
    32,             // NPerXdl
    2,              // MXdlPerWave
    4,              // NXdlPerWave
    S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
    S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
    2,              // ABlockTransferSrcVectorDim
    16,             // ABlockTransferSrcScalarPerVector
    16,             // ABlockTransferDstScalarPerVector_K1
    true,           // ABlockLdsAddExtraM
    S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
    S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
    2,              // BBlockTransferSrcVectorDim
    16,             // BBlockTransferSrcScalarPerVector
    16,             // BBlockTransferDstScalarPerVector_K1
    true,           // BBlockLdsAddExtraN
    7,              // CThreadTransferSrcDstVectorDim
    1>;             // CThreadTransferDstScalarPerVector

template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
@@ -92,253 +64,79 @@ using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<
                                                                                OutElementOp,
                                                                                NumDimSpatial>;

int main(int argc, char* argv[])
{
    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    int num_dim_spatial = 2;

    ck::tensor_operation::device::ConvParams params;

    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);

        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    if(num_dim_spatial == 1)
    {
        return run_conv_fwd<1,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<1>,
                            ReferenceConvNDFwdInstance<1>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 2)
    {
        return run_conv_fwd<2,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<2>,
                            ReferenceConvNDFwdInstance<2>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 3)
    {
        return run_conv_fwd<3,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<3>,
                            ReferenceConvNDFwdInstance<3>>(
            params, do_verification, init_method, time_kernel);
    }

    return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{
ck::tensor_operation::device::ConvParams params;
params.num_dim_spatial_ = num_dim_spatial;
params.N_ = std::stoi(argv[arg_idx++]);
params.K_ = std::stoi(argv[arg_idx++]);
params.C_ = std::stoi(argv[arg_idx++]);
params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
}
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
}
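For illustration, a minimal usage sketch of parse_conv_params follows (the helper name and all values below are invented for this example and are not part of the commit): it builds an argv-style array describing a 1D problem and parses it, with arg_idx = 5 pointing at N, the first convolution argument after the four control arguments.

#include <string>
#include <vector>

// Hypothetical helper, shown only to illustrate the expected argv layout.
inline ck::tensor_operation::device::ConvParams make_example_1d_params()
{
    // program, verify, init, time, num_dim_spatial, then N K C, X, Wi, Sx, Dx, LeftPx, RightPx
    std::vector<std::string> tokens = {
        "example_convnd_fwd", "1", "1", "0", "1", "4", "16", "8", "3", "32", "1", "1", "1", "1"};

    std::vector<char*> argv_like;
    for(auto& t : tokens)
    {
        argv_like.push_back(t.data()); // std::string::data() is non-const since C++17
    }

    // arg_idx = 5 is the index of N in the layout printed by print_helper_msg()
    return parse_conv_params(/*num_dim_spatial=*/1, /*arg_idx=*/5, argv_like.data());
}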