Commit 0e8d7ed3 authored by Chao Liu's avatar Chao Liu
Browse files

update example

parent 61510f0a
...@@ -23,8 +23,8 @@ void print_helper_msg() ...@@ -23,8 +23,8 @@ void print_helper_msg()
std::cout << "arg1: verification (0=no, 1=yes)\n" std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n" << "arg3: time kernel (0=no, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n" << "Following arguments (depending on number of spatial dims):\n"
<< " N spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
<< " G, N, K, C, \n" << " G, N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n" << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n" << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
...@@ -35,7 +35,7 @@ void print_helper_msg() ...@@ -35,7 +35,7 @@ void print_helper_msg()
<< std::endl; << std::endl;
} }
ck::utils::conv::ConvParam parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[]) ck::utils::conv::ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
{ {
const ck::index_t G = std::stoi(argv[arg_idx++]); const ck::index_t G = std::stoi(argv[arg_idx++]);
const ck::index_t N = std::stoi(argv[arg_idx++]); const ck::index_t N = std::stoi(argv[arg_idx++]);
...@@ -92,11 +92,7 @@ ck::utils::conv::ConvParam parse_conv_params(int num_dim_spatial, int arg_idx, c ...@@ -92,11 +92,7 @@ ck::utils::conv::ConvParam parse_conv_params(int num_dim_spatial, int arg_idx, c
input_right_pads}; input_right_pads};
} }
// FIXME: current implementation only support NCHW/NHWC layout
template <ck::index_t NDimSpatial, template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType, typename InDataType,
typename WeiDataType, typename WeiDataType,
typename OutDataType, typename OutDataType,
...@@ -104,32 +100,24 @@ template <ck::index_t NDimSpatial, ...@@ -104,32 +100,24 @@ template <ck::index_t NDimSpatial,
typename WeiElementOp, typename WeiElementOp,
typename OutElementOp, typename OutElementOp,
typename DeviceConvNDFwdInstance> typename DeviceConvNDFwdInstance>
int run_conv_fwd(bool do_verification, int run_grouped_conv_fwd(bool do_verification,
int init_method, int init_method,
bool time_kernel, bool time_kernel,
const ck::utils::conv::ConvParam& conv_param, const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const InElementOp& in_element_op, const InElementOp& in_element_op,
const WeiElementOp& wei_element_op, const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op) const OutElementOp& out_element_op)
{ {
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_packed<WeiLayout>(conv_param);
const auto bias_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_packed<OutLayout>(conv_param);
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_packed<OutLayout>(conv_param);
Tensor<InDataType> in(in_g_n_c_wis_desc); Tensor<InDataType> in(in_g_n_c_wis_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc); Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutDataType> bias(bias_g_n_k_wos_desc);
Tensor<OutDataType> out_host(out_g_n_k_wos_desc); Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc); Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
std::cout << "in: " << in.mDesc << std::endl; std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl; std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl; std::cout << "out: " << out_host.mDesc << std::endl;
switch(init_method) switch(init_method)
...@@ -138,29 +126,23 @@ int run_conv_fwd(bool do_verification, ...@@ -138,29 +126,23 @@ int run_conv_fwd(bool do_verification,
case 1: case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5}); wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break; break;
default: default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0}); in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5}); wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
} }
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data()); in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data()); wei_device_buf.ToDevice(wei.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{}; std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{}; std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{}; std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{}; std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{}; std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{}; std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}; std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
...@@ -174,8 +156,6 @@ int run_conv_fwd(bool do_verification, ...@@ -174,8 +156,6 @@ int run_conv_fwd(bool do_verification,
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(bias_g_n_k_wos_desc.GetLengths(), d_g_n_k_wos_lengths);
copy(bias_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides); copy(conv_param.conv_filter_strides_, conv_filter_strides);
...@@ -183,20 +163,19 @@ int run_conv_fwd(bool do_verification, ...@@ -183,20 +163,19 @@ int run_conv_fwd(bool do_verification,
copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads); copy(conv_param.input_right_pads_, input_right_pads);
// do GEMM // do Conv
auto conv = DeviceConvNDFwdInstance{}; auto conv = DeviceConvNDFwdInstance{};
auto invoker = conv.MakeInvoker(); auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument( auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(), wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()}, std::array<const void*, 0>{},
out_device_buf.GetDeviceBuffer(), out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths, a_g_n_c_wis_lengths,
a_g_n_c_wis_strides, a_g_n_c_wis_strides,
b_g_k_c_xs_lengths, b_g_k_c_xs_lengths,
b_g_k_c_xs_strides, b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_lengths}}, std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_strides}}, std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
e_g_n_k_wos_lengths, e_g_n_k_wos_lengths,
e_g_n_k_wos_strides, e_g_n_k_wos_strides,
conv_filter_strides, conv_filter_strides,
...@@ -226,36 +205,28 @@ int run_conv_fwd(bool do_verification, ...@@ -226,36 +205,28 @@ int run_conv_fwd(bool do_verification,
if(do_verification) if(do_verification)
{ {
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
Tensor<OutDataType> c_host(out_g_n_k_wos_desc);
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial, auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
PassThrough>(); OutElementOp>();
auto ref_invoker = ref_conv.MakeInvoker(); auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in, auto ref_argument = ref_conv.MakeArgument(in,
wei, wei,
c_host, out_host,
conv_param.conv_filter_strides_, conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_, conv_param.conv_filter_dilations_,
conv_param.input_left_pads_, conv_param.input_left_pads_,
conv_param.input_right_pads_, conv_param.input_right_pads_,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
PassThrough{}); out_element_op);
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
// TODO: implement elementwise operation for host
out_host.ForEach(
[&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); });
out_device_buf.FromDevice(out_device.mData.data()); out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err( return ck::utils::check_err(
......
...@@ -3,12 +3,15 @@ ...@@ -3,12 +3,15 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::bhalf_t; using InDataType = ck::bhalf_t;
using WeiDataType = ck::bhalf_t; using WeiDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
using AccDataType = float; using AccDataType = float;
using CShuffleDataType = float;
using OutDataType = ck::bhalf_t;
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault = static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial> static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
InDataType, // using DeviceGroupedConvNDFwdInstance =
WeiDataType, // ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
OutDataType, // NDimSpatial,
AccDataType, // InLayout,
InElementOp, // Input Elementwise Operation WeiLayout,
WeiElementOp, // Weights Elementwise Operation ck::Tuple<>,
OutElementOp, // Output Elementwise Operation OutLayout,
ConvFwdDefault, // ConvForwardSpecialization InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize 256, // BlockSize
128, // MPerBlock 128, // MPerBlock
256, // NPerBlock 256, // NPerBlock
4, // K0PerBlock 32, // KPerBlock
8, // K1 8, // AK1
8, // BK1
32, // MPerXdl 32, // MPerXdl
32, // NPerXdl 32, // NPerXdl
2, // MXdlPerWave 2, // MXdlPerWave
4, // NXdlPerWave 4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim 2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector 8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1 8, // ABlockTransferDstScalarPerVector_AK1
true, // ABlockLdsAddExtraM 1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim 2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector 8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1 8, // BBlockTransferDstScalarPerVector_BK1
true, // BBlockLdsAddExtraN 1, // BBlockLdsExtraN
7, // CThreadTransferSrcDstVectorDim 1,
1>; // CThreadTransferDstScalarPerVector 1,
S<1, 32, 1, 8>,
8>;
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 1; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{ ck::utils::conv::ConvParam conv_param{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; 2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1) if(argc == 1)
{ {
...@@ -84,71 +102,113 @@ int main(int argc, char* argv[]) ...@@ -84,71 +102,113 @@ int main(int argc, char* argv[])
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]); const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv); conv_param = parse_conv_param(num_dim_spatial, 5, argv);
} }
const auto in_element_op = InElementOp{}; const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{}; const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{}; const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1) if(conv_param.num_dim_spatial_ == 1)
{ {
return run_conv_fwd<1, using InLayout = ctc::GNWC;
ck::tensor_layout::convolution::NWC, using WeiLayout = ctc::GKXC;
ck::tensor_layout::convolution::KXC, using OutLayout = ctc::GNWK;
ck::tensor_layout::convolution::NWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification, DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 2) else if(conv_param.num_dim_spatial_ == 2)
{ {
return run_conv_fwd<2, using InLayout = ctc::GNHWC;
ck::tensor_layout::convolution::NHWC, using WeiLayout = ctc::GKYXC;
ck::tensor_layout::convolution::KYXC, using OutLayout = ctc::GNHWK;
ck::tensor_layout::convolution::NHWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification, DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 3) else if(conv_param.num_dim_spatial_ == 3)
{ {
return run_conv_fwd<3, using InLayout = ctc::GNDHWC;
ck::tensor_layout::convolution::NDHWC, using WeiLayout = ctc::GKZYXC;
ck::tensor_layout::convolution::KZYXC, using OutLayout = ctc::GNDHWK;
ck::tensor_layout::convolution::NDHWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification, DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
......
...@@ -3,114 +3,41 @@ ...@@ -3,114 +3,41 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::half_t; using InDataType = ck::half_t;
using WeiDataType = ck::half_t; using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float; using AccDataType = float;
using CShuffleDataType = ck::half_t;
using OutDataType = ck::half_t;
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough; using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
// using OutElementOp = ck::tensor_operation::element_wise::UnaryConvert; using OutElementOp = ck::tensor_operation::element_wise::UnaryConvert;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
#if 0
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, //
InDataType, //
WeiDataType, //
OutDataType, //
AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, 1 // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
#else
using CShuffleDataType = ck::half_t;
using DDataType = ck::half_t;
static constexpr auto ConvSpec = static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial> template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvFwdMultipleD_Xdl_CShuffle< using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial, NDimSpatial,
#if 0 InLayout,
ck::tuple_element_t<NDimSpatial - 1, WeiLayout,
ck::Tuple<ck::tensor_layout::convolution::G_NW_C, ck::Tuple<>,
ck::tensor_layout::convolution::G_NHW_C, OutLayout,
ck::tensor_layout::convolution::G_NDHW_C>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::G_K_X_C,
ck::tensor_layout::convolution::G_K_YX_C,
ck::tensor_layout::convolution::G_K_ZYX_C>>,
ck::Tuple<ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::G_NW_K,
ck::tensor_layout::convolution::G_NHW_K,
ck::tensor_layout::convolution::G_NDHW_K>>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::G_NW_K,
ck::tensor_layout::convolution::G_NHW_K,
ck::tensor_layout::convolution::G_NDHW_K>>,
#else
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::NWGC,
ck::tensor_layout::convolution::NHWGC,
ck::tensor_layout::convolution::NDHWGC>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::KXGC,
ck::tensor_layout::convolution::KYXGC,
ck::tensor_layout::convolution::KZYXGC>>,
ck::Tuple<ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::NWGK,
ck::tensor_layout::convolution::NHWGK,
ck::tensor_layout::convolution::NDHWGK>>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::NWGK,
ck::tensor_layout::convolution::NHWGK,
ck::tensor_layout::convolution::NDHWGK>>,
#endif
InDataType, InDataType,
WeiDataType, WeiDataType,
AccDataType, AccDataType,
CShuffleDataType, CShuffleDataType,
ck::Tuple<DDataType>, ck::Tuple<>,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
...@@ -122,41 +49,42 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvFwdMulti ...@@ -122,41 +49,42 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvFwdMulti
128, // MPerBlock 128, // MPerBlock
256, // NPerBlock 256, // NPerBlock
32, // KPerBlock 32, // KPerBlock
8, // K1 8, // AK1
8, // BK1
32, // MPerXdl 32, // MPerXdl
32, // NPerXdl 32, // NPerXdl
2, // MXdlPerWave 2, // MXdlPerWave
4, // NXdlPerWave 4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim 2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector 8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1 8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM 1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim 2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector 8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1 8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN 1, // BBlockLdsExtraN
1, 1,
1, 1,
S<1, 32, 1, 8>, S<1, 32, 1, 8>,
8>; 8>;
#endif
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 1; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{ ck::utils::conv::ConvParam conv_param{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; 2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1) if(argc == 1)
...@@ -174,71 +102,113 @@ int main(int argc, char* argv[]) ...@@ -174,71 +102,113 @@ int main(int argc, char* argv[])
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]); const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv); conv_param = parse_conv_param(num_dim_spatial, 5, argv);
} }
const auto in_element_op = InElementOp{}; const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{}; const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{}; const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1) if(conv_param.num_dim_spatial_ == 1)
{ {
return run_conv_fwd<1, using InLayout = ctc::GNWC;
ck::tensor_layout::convolution::NWGC, using WeiLayout = ctc::GKXC;
ck::tensor_layout::convolution::KXGC, using OutLayout = ctc::GNWK;
ck::tensor_layout::convolution::NWGK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification, DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 2) else if(conv_param.num_dim_spatial_ == 2)
{ {
return run_conv_fwd<2, using InLayout = ctc::GNHWC;
ck::tensor_layout::convolution::NHWGC, using WeiLayout = ctc::GKYXC;
ck::tensor_layout::convolution::KYXGC, using OutLayout = ctc::GNHWK;
ck::tensor_layout::convolution::NHWGK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification, DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 3) else if(conv_param.num_dim_spatial_ == 3)
{ {
return run_conv_fwd<3, using InLayout = ctc::GNDHWC;
ck::tensor_layout::convolution::NDHWGC, using WeiLayout = ctc::GKZYXC;
ck::tensor_layout::convolution::KZYXGC, using OutLayout = ctc::GNDHWK;
ck::tensor_layout::convolution::NDHWGK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification, DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
......
...@@ -3,12 +3,15 @@ ...@@ -3,12 +3,15 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = float; using InDataType = float;
using WeiDataType = float; using WeiDataType = float;
using OutDataType = float;
using AccDataType = float; using AccDataType = float;
using CShuffleDataType = float;
using OutDataType = float;
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault = static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial> static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
InDataType, // using DeviceGroupedConvNDFwdInstance =
WeiDataType, // ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
OutDataType, // NDimSpatial,
AccDataType, // InLayout,
InElementOp, // Input Elementwise Operation WeiLayout,
WeiElementOp, // Weights Elementwise Operation ck::Tuple<>,
OutElementOp, // Output Elementwise Operation OutLayout,
ConvFwdDefault, // ConvForwardSpecialization InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize 256, // BlockSize
256, // MPerBlock 128, // MPerBlock
128, // NPerBlock 256, // NPerBlock
4, // K0PerBlock 16, // KPerBlock
4, // K1 4, // AK1
32, // MPerXDL 4, // BK1
32, // NPerXDL 32, // MPerXdl
4, // MXdlPerWave 32, // NPerXdl
2, // NXdlPerWave 2, // MXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim 2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector 4, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_K1 4, // ABlockTransferDstScalarPerVector_AK1
true, // ABlockLdsAddExtraM 1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim 2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector 4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1 4, // BBlockTransferDstScalarPerVector_BK1
true, // BBlockTransferAddExtraN 1, // BBlockLdsExtraN
7, // CThreadTransferSrcDstVectorDim 1,
1>; // CThreadTransferDstScalarPerVector 1,
S<1, 16, 1, 16>,
4>;
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 1; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{ ck::utils::conv::ConvParam conv_param{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; 2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1) if(argc == 1)
{ {
...@@ -84,71 +102,113 @@ int main(int argc, char* argv[]) ...@@ -84,71 +102,113 @@ int main(int argc, char* argv[])
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]); const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv); conv_param = parse_conv_param(num_dim_spatial, 5, argv);
} }
const auto in_element_op = InElementOp{}; const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{}; const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{}; const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1) if(conv_param.num_dim_spatial_ == 1)
{ {
return run_conv_fwd<1, using InLayout = ctc::GNWC;
ck::tensor_layout::convolution::NWC, using WeiLayout = ctc::GKXC;
ck::tensor_layout::convolution::KXC, using OutLayout = ctc::GNWK;
ck::tensor_layout::convolution::NWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification, DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 2) else if(conv_param.num_dim_spatial_ == 2)
{ {
return run_conv_fwd<2, using InLayout = ctc::GNHWC;
ck::tensor_layout::convolution::NHWC, using WeiLayout = ctc::GKYXC;
ck::tensor_layout::convolution::KYXC, using OutLayout = ctc::GNHWK;
ck::tensor_layout::convolution::NHWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification, DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 3) else if(conv_param.num_dim_spatial_ == 3)
{ {
return run_conv_fwd<3, using InLayout = ctc::GNDHWC;
ck::tensor_layout::convolution::NDHWC, using WeiLayout = ctc::GKZYXC;
ck::tensor_layout::convolution::KZYXC, using OutLayout = ctc::GNDHWK;
ck::tensor_layout::convolution::NDHWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification, DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
......
...@@ -3,12 +3,15 @@ ...@@ -3,12 +3,15 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = double; using InDataType = double;
using WeiDataType = double; using WeiDataType = double;
using OutDataType = double;
using AccDataType = double; using AccDataType = double;
using CShuffleDataType = double;
using OutDataType = double;
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault = static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial> static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
InDataType, // using DeviceGroupedConvNDFwdInstance =
WeiDataType, // ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
OutDataType, // NDimSpatial,
AccDataType, // InLayout,
InElementOp, // Input Elementwise Operation WeiLayout,
WeiElementOp, // Weights Elementwise Operation ck::Tuple<>,
OutElementOp, // Output Elementwise Operation OutLayout,
ConvFwdDefault, // ConvForwardSpecialization InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize 256, // BlockSize
128, // MPerBlock 128, // MPerBlock
128, // NPerBlock 128, // NPerBlock
4, // K0PerBlock 8, // KPerBlock
2, // K1 2, // AK1
16, // MPerXDL 2, // BK1
16, // NPerXDL 16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave 4, // MXdlPerWave
4, // NXdlPerWave 4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim 2, // ABlockTransferSrcVectorDim
2, // ABlockTransferSrcScalarPerVector 2, // ABlockTransferSrcScalarPerVector
2, // ABlockTransferDstScalarPerVector_K1 2, // ABlockTransferDstScalarPerVector_AK1
true, // ABlockLdsAddExtraM 1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim 2, // BBlockTransferSrcVectorDim
2, // BBlockTransferSrcScalarPerVector 2, // BBlockTransferSrcScalarPerVector
2, // BBlockTransferDstScalarPerVector_K1 2, // BBlockTransferDstScalarPerVector_BK1
true, // BBlockTransferAddExtraN 1, // BBlockLdsExtraN
7, // CThreadTransferSrcDstVectorDim 1,
1>; // CThreadTransferDstScalarPerVector 1,
S<1, 16, 1, 16>,
1>;
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 1; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{ ck::utils::conv::ConvParam conv_param{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; 2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1) if(argc == 1)
{ {
...@@ -84,71 +102,113 @@ int main(int argc, char* argv[]) ...@@ -84,71 +102,113 @@ int main(int argc, char* argv[])
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]); const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv); conv_param = parse_conv_param(num_dim_spatial, 5, argv);
} }
const auto in_element_op = InElementOp{}; const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{}; const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{}; const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1) if(conv_param.num_dim_spatial_ == 1)
{ {
return run_conv_fwd<1, using InLayout = ctc::GNWC;
ck::tensor_layout::convolution::NWC, using WeiLayout = ctc::GKXC;
ck::tensor_layout::convolution::KXC, using OutLayout = ctc::GNWK;
ck::tensor_layout::convolution::NWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification, DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 2) else if(conv_param.num_dim_spatial_ == 2)
{ {
return run_conv_fwd<2, using InLayout = ctc::GNHWC;
ck::tensor_layout::convolution::NHWC, using WeiLayout = ctc::GKYXC;
ck::tensor_layout::convolution::KYXC, using OutLayout = ctc::GNHWK;
ck::tensor_layout::convolution::NHWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification, DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 3) else if(conv_param.num_dim_spatial_ == 3)
{ {
return run_conv_fwd<3, using InLayout = ctc::GNDHWC;
ck::tensor_layout::convolution::NDHWC, using WeiLayout = ctc::GKZYXC;
ck::tensor_layout::convolution::KZYXC, using OutLayout = ctc::GNDHWK;
ck::tensor_layout::convolution::NDHWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification, DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
......
...@@ -3,12 +3,15 @@ ...@@ -3,12 +3,15 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = int8_t; using InDataType = int8_t;
using WeiDataType = int8_t; using WeiDataType = int8_t;
using OutDataType = int8_t;
using AccDataType = int32_t; using AccDataType = int32_t;
using CShuffleDataType = int8_t;
using OutDataType = int8_t;
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -17,57 +20,72 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault = static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
template <ck::index_t NDimSpatial> static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
NDimSpatial, // NDimSpatial template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
InDataType, // using DeviceGroupedConvNDFwdInstance =
WeiDataType, // ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
OutDataType, // NDimSpatial,
AccDataType, // InLayout,
InElementOp, // Input Elementwise Operation WeiLayout,
WeiElementOp, // Weights Elementwise Operation ck::Tuple<>,
OutElementOp, // Output Elementwise Operation OutLayout,
ConvFwdDefault, // ConvForwardSpecialization InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize 256, // BlockSize
128, // MPerBlock 128, // MPerBlock
256, // NPerBlock 256, // NPerBlock
4, // K0PerBlock 64, // KPerBlock
16, // K1 16, // AK1
16, // BK1
32, // MPerXdl 32, // MPerXdl
32, // NPerXdl 32, // NPerXdl
2, // MXdlPerWave 2, // MXdlPerWave
4, // NXdlPerWave 4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim 2, // ABlockTransferSrcVectorDim
16, // ABlockTransferSrcScalarPerVector 16, // ABlockTransferSrcScalarPerVector
16, // ABlockTransferDstScalarPerVector_K1 16, // ABlockTransferDstScalarPerVector_AK1
true, // ABlockLdsAddExtraM 1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim 2, // BBlockTransferSrcVectorDim
16, // BBlockTransferSrcScalarPerVector 16, // BBlockTransferSrcScalarPerVector
16, // BBlockTransferDstScalarPerVector_K1 16, // BBlockTransferDstScalarPerVector_BK1
true, // BBlockLdsAddExtraN 1, // BBlockLdsExtraN
7, // CThreadTransferSrcDstVectorDim 1,
1>; // CThreadTransferDstScalarPerVector 1,
S<1, 64, 1, 4>,
16>;
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 1; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParam params{ ck::utils::conv::ConvParam conv_param{
2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; 2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1) if(argc == 1)
{ {
...@@ -84,71 +102,113 @@ int main(int argc, char* argv[]) ...@@ -84,71 +102,113 @@ int main(int argc, char* argv[])
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]); const ck::index_t num_dim_spatial = std::stoi(argv[4]);
params = parse_conv_params(num_dim_spatial, 5, argv); conv_param = parse_conv_param(num_dim_spatial, 5, argv);
} }
const auto in_element_op = InElementOp{}; const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{}; const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{}; const auto out_element_op = OutElementOp{};
if(num_dim_spatial == 1) if(conv_param.num_dim_spatial_ == 1)
{ {
return run_conv_fwd<1, using InLayout = ctc::GNWC;
ck::tensor_layout::convolution::NWC, using WeiLayout = ctc::GKXC;
ck::tensor_layout::convolution::KXC, using OutLayout = ctc::GNWK;
ck::tensor_layout::convolution::NWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
1,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<1>>(do_verification, DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 2) else if(conv_param.num_dim_spatial_ == 2)
{ {
return run_conv_fwd<2, using InLayout = ctc::GNHWC;
ck::tensor_layout::convolution::NHWC, using WeiLayout = ctc::GKYXC;
ck::tensor_layout::convolution::KYXC, using OutLayout = ctc::GNHWK;
ck::tensor_layout::convolution::NHWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
2,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<2>>(do_verification, DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
} }
else if(num_dim_spatial == 3) else if(conv_param.num_dim_spatial_ == 3)
{ {
return run_conv_fwd<3, using InLayout = ctc::GNDHWC;
ck::tensor_layout::convolution::NDHWC, using WeiLayout = ctc::GKZYXC;
ck::tensor_layout::convolution::KZYXC, using OutLayout = ctc::GNDHWK;
ck::tensor_layout::convolution::NDHWK,
const auto in_g_n_c_wis_desc =
make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
return run_grouped_conv_fwd<
3,
InDataType, InDataType,
WeiDataType, WeiDataType,
OutDataType, OutDataType,
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceConvNDFwdInstance<3>>(do_verification, DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, OutLayout>>(do_verification,
init_method, init_method,
time_kernel, time_kernel,
params, conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op); out_element_op);
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
void print_helper_msg() void print_helper_msg()
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::half_t; using InDataType = ck::half_t;
using WeiDataType = ck::half_t; using WeiDataType = ck::half_t;
using AccDataType = float; using AccDataType = float;
...@@ -29,7 +31,7 @@ template <ck::index_t NDimSpatial, ...@@ -29,7 +31,7 @@ template <ck::index_t NDimSpatial,
typename WeiLayout, typename WeiLayout,
typename BiasLayout, typename BiasLayout,
typename OutLayout> typename OutLayout>
using DeviceGroupledConvNDFwdInstance = using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial, NDimSpatial,
InLayout, InLayout,
...@@ -52,24 +54,25 @@ using DeviceGroupledConvNDFwdInstance = ...@@ -52,24 +54,25 @@ using DeviceGroupledConvNDFwdInstance =
128, // MPerBlock 128, // MPerBlock
256, // NPerBlock 256, // NPerBlock
32, // KPerBlock 32, // KPerBlock
8, // K1 8, // AK1
8, // BK1
32, // MPerXdl 32, // MPerXdl
32, // NPerXdl 32, // NPerXdl
2, // MXdlPerWave 2, // MXdlPerWave
4, // NXdlPerWave 4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim 2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector 8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1 8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM 1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim 2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector 8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1 8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN 1, // BBlockLdsExtraN
1, 1,
1, 1,
...@@ -164,7 +167,7 @@ int main(int argc, char* argv[]) ...@@ -164,7 +167,7 @@ int main(int argc, char* argv[])
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceGroupledConvNDFwdInstance<1, InLayout, WeiLayout, BiasLayout, OutLayout>>( DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, BiasLayout, OutLayout>>(
do_verification, do_verification,
init_method, init_method,
time_kernel, time_kernel,
...@@ -251,7 +254,7 @@ int main(int argc, char* argv[]) ...@@ -251,7 +254,7 @@ int main(int argc, char* argv[])
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceGroupledConvNDFwdInstance<2, InLayout, WeiLayout, BiasLayout, OutLayout>>( DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, BiasLayout, OutLayout>>(
do_verification, do_verification,
init_method, init_method,
time_kernel, time_kernel,
...@@ -349,7 +352,7 @@ int main(int argc, char* argv[]) ...@@ -349,7 +352,7 @@ int main(int argc, char* argv[])
InElementOp, InElementOp,
WeiElementOp, WeiElementOp,
OutElementOp, OutElementOp,
DeviceGroupledConvNDFwdInstance<3, InLayout, WeiLayout, BiasLayout, OutLayout>>( DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, BiasLayout, OutLayout>>(
do_verification, do_verification,
init_method, init_method,
time_kernel, time_kernel,
......
...@@ -248,7 +248,8 @@ template <index_t NDimSpatial, ...@@ -248,7 +248,8 @@ template <index_t NDimSpatial,
index_t MPerBlock, index_t MPerBlock,
index_t NPerBlock, index_t NPerBlock,
index_t KPerBlock, index_t KPerBlock,
index_t K1, index_t AK1,
index_t BK1,
index_t MPerXDL, index_t MPerXDL,
index_t NPerXDL, index_t NPerXDL,
index_t MXdlPerWave, index_t MXdlPerWave,
...@@ -295,9 +296,6 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle ...@@ -295,9 +296,6 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
static constexpr auto I2 = Number<2>{}; static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{}; static constexpr auto I3 = Number<3>{};
static constexpr auto K1Number = Number<K1>{};
static constexpr auto GemmK1Number = K1Number;
static constexpr auto matrix_padder = static constexpr auto matrix_padder =
MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock}; MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
...@@ -1256,8 +1254,8 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle ...@@ -1256,8 +1254,8 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
MPerBlock, MPerBlock,
NPerBlock, NPerBlock,
KPerBlock, KPerBlock,
K1, AK1,
K1, BK1,
MPerXDL, MPerXDL,
NPerXDL, NPerXDL,
MXdlPerWave, MXdlPerWave,
......
...@@ -39,11 +39,23 @@ struct PassThrough ...@@ -39,11 +39,23 @@ struct PassThrough
y = x; y = x;
} }
template <>
__host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
{
y = type_convert<bhalf_t>(x);
}
template <> template <>
__host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const __host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
{ {
y = x; y = x;
} }
template <>
__host__ __device__ void operator()<int8_t, int32_t>(int8_t& y, const int32_t& x) const
{
y = type_convert<int8_t>(x);
}
}; };
struct UnaryConvert struct UnaryConvert
......
...@@ -156,9 +156,9 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvPa ...@@ -156,9 +156,9 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvPa
param.filter_spatial_lengths_.begin(), param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
} }
else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> || else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> || ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKYXC> ||
ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>) ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKZYXC>)
{ {
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_), physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_), static_cast<std::size_t>(param.K_),
...@@ -183,6 +183,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvPa ...@@ -183,6 +183,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvPa
else else
{ {
printf("%s\n", __func__); printf("%s\n", __func__);
printf("%s\n", WeiLayout::name);
throw std::runtime_error("wrong! unsupported layout"); throw std::runtime_error("wrong! unsupported layout");
} }
...@@ -238,6 +239,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck::utils::conv::ConvP ...@@ -238,6 +239,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck::utils::conv::ConvP
else else
{ {
printf("%s\n", __func__); printf("%s\n", __func__);
printf("%s\n", OutLayout::name);
throw std::runtime_error("wrong! unsupported layout"); throw std::runtime_error("wrong! unsupported layout");
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment