"profiler/vscode:/vscode.git/clone" did not exist on "16ec937ebb898fcf7e8e72e9cf36e30553dd84b0"
Commit 3549e344 authored by Chao Liu's avatar Chao Liu
Browse files

adding group

parent c0bfcf91
@@ -112,50 +112,14 @@ int run_conv_fwd(bool do_verification,
                  const WeiElementOp& wei_element_op,
                  const OutElementOp& out_element_op)
 {
-#if 0
-    const auto in_g_n_c_wis_desc = ck::utils::conv::get_input_host_tensor_descriptor<InLayout>(conv_param);
-    const auto wei_g_k_c_xs_desc = ck::utils::conv::get_weight_host_tensor_descriptor<WeiLayout>(conv_param);
-    const auto out_g_n_k_wos_desc = ck::utils::conv::get_output_host_tensor_descriptor<OutLayout>(conv_param);
-#else
-    const auto in_g_n_wis_c_desc = HostTensorDescriptor(
-        std::vector<std::size_t>{static_cast<std::size_t>(conv_param.G_),
-                                 static_cast<std::size_t>(conv_param.N_),
-                                 static_cast<std::size_t>(conv_param.input_spatial_lengths_[0]),
-                                 static_cast<std::size_t>(conv_param.input_spatial_lengths_[1]),
-                                 static_cast<std::size_t>(conv_param.C_)});
-
-    const auto wei_g_k_xs_c_desc = HostTensorDescriptor(
-        std::vector<std::size_t>{static_cast<std::size_t>(conv_param.G_),
-                                 static_cast<std::size_t>(conv_param.K_),
-                                 static_cast<std::size_t>(conv_param.filter_spatial_lengths_[0]),
-                                 static_cast<std::size_t>(conv_param.filter_spatial_lengths_[1]),
-                                 static_cast<std::size_t>(conv_param.C_)});
-
-    const auto bias_g_n_wos_k_desc = HostTensorDescriptor(
-        std::vector<std::size_t>{static_cast<std::size_t>(conv_param.G_),
-                                 static_cast<std::size_t>(conv_param.N_),
-                                 static_cast<std::size_t>(conv_param.output_spatial_lengths_[0]),
-                                 static_cast<std::size_t>(conv_param.output_spatial_lengths_[1]),
-                                 static_cast<std::size_t>(conv_param.K_)},
-        std::vector<std::size_t>{0, 0, 0, 0, 1});
-
-    const auto out_g_n_wos_k_desc = HostTensorDescriptor(
-        std::vector<std::size_t>{static_cast<std::size_t>(conv_param.G_),
-                                 static_cast<std::size_t>(conv_param.N_),
-                                 static_cast<std::size_t>(conv_param.output_spatial_lengths_[0]),
-                                 static_cast<std::size_t>(conv_param.output_spatial_lengths_[1]),
-                                 static_cast<std::size_t>(conv_param.K_)});
-
-    // tensor descriptor in NCHW/KXYC/NKHW dimensional order
-    const auto in_g_n_c_wis_desc = transpose_host_tensor_descriptor_given_new2old(
-        in_g_n_wis_c_desc, std::vector<ck::index_t>{0, 1, 4, 2, 3});
-    const auto wei_g_k_c_xs_desc = transpose_host_tensor_descriptor_given_new2old(
-        wei_g_k_xs_c_desc, std::vector<ck::index_t>{0, 1, 4, 2, 3});
-    const auto bias_g_n_k_wos_desc = transpose_host_tensor_descriptor_given_new2old(
-        bias_g_n_wos_k_desc, std::vector<ck::index_t>{0, 1, 4, 2, 3});
-    const auto out_g_n_k_wos_desc = transpose_host_tensor_descriptor_given_new2old(
-        out_g_n_wos_k_desc, std::vector<ck::index_t>{0, 1, 4, 2, 3});
-#endif
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_packed<InLayout>(conv_param);
+
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_packed<WeiLayout>(conv_param);
+
+    const auto bias_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_packed<OutLayout>(conv_param);
+
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_packed<OutLayout>(conv_param);

     Tensor<InDataType> in(in_g_n_c_wis_desc);
     Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
@@ -267,9 +231,6 @@ int run_conv_fwd(bool do_verification,
         Tensor<OutDataType> c_host(out_g_n_k_wos_desc);

         auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
-                                                                     InLayout,
-                                                                     WeiLayout,
-                                                                     OutLayout,
                                                                      InDataType,
                                                                      WeiDataType,
                                                                      OutDataType,
@@ -291,24 +252,9 @@ int run_conv_fwd(bool do_verification,
         ref_invoker.Run(ref_argument);

-        for(int g = 0; g < out_host.mDesc.GetLengths()[0]; g++)
-        {
-            for(int n = 0; n < out_host.mDesc.GetLengths()[1]; n++)
-            {
-                for(int k = 0; k < out_host.mDesc.GetLengths()[2]; k++)
-                {
-                    for(int ho = 0; ho < out_host.mDesc.GetLengths()[3]; ho++)
-                    {
-                        for(int wo = 0; wo < out_host.mDesc.GetLengths()[4]; wo++)
-                        {
-                            out_element_op(out_host(g, n, k, ho, wo),
-                                           c_host(g, n, k, ho, wo),
-                                           bias(g, n, k, ho, wo));
-                        }
-                    }
-                }
-            }
-        }
+        // TODO: implement elementwise operation for host
+        out_host.ForEach(
+            [&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); });

         out_device_buf.FromDevice(out_device.mData.data());
...
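The ForEach call above replaces the five nested loops with a single traversal over every index of the output descriptor. Below is a minimal stand-alone sketch of that pattern; SimpleTensor and AddRelu are simplified stand-ins of my own, not the real CK Tensor and element-op types:

    // Minimal stand-alone sketch of the elementwise epilogue pattern above.
    // SimpleTensor::ForEach visits every index, like the host Tensor does.
    #include <cstddef>
    #include <vector>

    struct SimpleTensor
    {
        std::vector<float> data;
        float& operator()(std::size_t i) { return data[i]; }

        template <typename F>
        void ForEach(F f)
        {
            for(std::size_t i = 0; i < data.size(); ++i)
                f(*this, i);
        }
    };

    // out = relu(c + bias): the kind of fused op an OutElementOp represents
    struct AddRelu
    {
        void operator()(float& out, const float& c, const float& bias) const
        {
            const float x = c + bias;
            out = x > 0.f ? x : 0.f;
        }
    };

    int main()
    {
        SimpleTensor out{std::vector<float>(8, 0.f)};
        SimpleTensor c{std::vector<float>{-2, -1, 0, 1, 2, 3, 4, 5}};
        SimpleTensor bias{std::vector<float>(8, 1.f)};

        AddRelu op;
        out.ForEach([&](auto&, auto idx) { op(out(idx), c(idx), bias(idx)); });
    }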
@@ -71,22 +71,41 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio

 template <ck::index_t NDimSpatial>
 using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvFwdMultipleD_Xdl_CShuffle<
     NDimSpatial,
-    ck::tuple_element_t<NDimSpatial - 1,
-                        ck::Tuple<ck::tensor_layout::convolution::NWC,
-                                  ck::tensor_layout::convolution::NHWC,
-                                  ck::tensor_layout::convolution::NDHWC>>,
-    ck::tuple_element_t<NDimSpatial - 1,
-                        ck::Tuple<ck::tensor_layout::convolution::KXC,
-                                  ck::tensor_layout::convolution::KYXC,
-                                  ck::tensor_layout::convolution::KZYXC>>,
-    ck::Tuple<ck::tuple_element_t<NDimSpatial - 1,
-                                  ck::Tuple<ck::tensor_layout::convolution::NW_K,
-                                            ck::tensor_layout::convolution::NHW_K,
-                                            ck::tensor_layout::convolution::NDHW_K>>>,
-    ck::tuple_element_t<NDimSpatial - 1,
-                        ck::Tuple<ck::tensor_layout::convolution::NWK,
-                                  ck::tensor_layout::convolution::NHWK,
-                                  ck::tensor_layout::convolution::NDHWK>>,
+#if 0
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::G_NW_C,
+                                  ck::tensor_layout::convolution::G_NHW_C,
+                                  ck::tensor_layout::convolution::G_NDHW_C>>,
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::G_K_X_C,
+                                  ck::tensor_layout::convolution::G_K_YX_C,
+                                  ck::tensor_layout::convolution::G_K_ZYX_C>>,
+    ck::Tuple<ck::tuple_element_t<NDimSpatial - 1,
+                                  ck::Tuple<ck::tensor_layout::convolution::G_NW_K,
+                                            ck::tensor_layout::convolution::G_NHW_K,
+                                            ck::tensor_layout::convolution::G_NDHW_K>>>,
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::G_NW_K,
+                                  ck::tensor_layout::convolution::G_NHW_K,
+                                  ck::tensor_layout::convolution::G_NDHW_K>>,
+#else
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::NWGC,
+                                  ck::tensor_layout::convolution::NHWGC,
+                                  ck::tensor_layout::convolution::NDHWGC>>,
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::KXGC,
+                                  ck::tensor_layout::convolution::KYXGC,
+                                  ck::tensor_layout::convolution::KZYXGC>>,
+    ck::Tuple<ck::tuple_element_t<NDimSpatial - 1,
+                                  ck::Tuple<ck::tensor_layout::convolution::NWGK,
+                                            ck::tensor_layout::convolution::NHWGK,
+                                            ck::tensor_layout::convolution::NDHWGK>>>,
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::NWGK,
+                                  ck::tensor_layout::convolution::NHWGK,
+                                  ck::tensor_layout::convolution::NDHWGK>>,
+#endif
     InDataType,
     WeiDataType,
     AccDataType,
@@ -167,9 +186,9 @@ int main(int argc, char* argv[])
     if(num_dim_spatial == 1)
     {
         return run_conv_fwd<1,
-                            ck::tensor_layout::convolution::NWC,
-                            ck::tensor_layout::convolution::KXC,
-                            ck::tensor_layout::convolution::NWK,
+                            ck::tensor_layout::convolution::NWGC,
+                            ck::tensor_layout::convolution::KXGC,
+                            ck::tensor_layout::convolution::NWGK,
                             InDataType,
                             WeiDataType,
                             OutDataType,
@@ -187,9 +206,9 @@ int main(int argc, char* argv[])
     else if(num_dim_spatial == 2)
     {
         return run_conv_fwd<2,
-                            ck::tensor_layout::convolution::NHWC,
-                            ck::tensor_layout::convolution::KYXC,
-                            ck::tensor_layout::convolution::NHWK,
+                            ck::tensor_layout::convolution::NHWGC,
+                            ck::tensor_layout::convolution::KYXGC,
+                            ck::tensor_layout::convolution::NHWGK,
                             InDataType,
                             WeiDataType,
                             OutDataType,
@@ -207,9 +226,9 @@ int main(int argc, char* argv[])
     else if(num_dim_spatial == 3)
     {
         return run_conv_fwd<3,
-                            ck::tensor_layout::convolution::NDHWC,
-                            ck::tensor_layout::convolution::KZYXC,
-                            ck::tensor_layout::convolution::NDHWK,
+                            ck::tensor_layout::convolution::NDHWGC,
+                            ck::tensor_layout::convolution::KZYXGC,
+                            ck::tensor_layout::convolution::NDHWGK,
                             InDataType,
                             WeiDataType,
                             OutDataType,
...
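The ck::tuple_element_t<NDimSpatial - 1, ck::Tuple<...>> idiom above selects one layout tag per spatial rank at compile time. A stand-alone sketch of the same dispatch using std::tuple (the CK types behave analogously; the tag names are placeholders):

    // Compile-time layout dispatch, sketched with std::tuple.
    #include <tuple>
    #include <type_traits>

    struct NWGC {};   // 1D input layout tag
    struct NHWGC {};  // 2D input layout tag
    struct NDHWGC {}; // 3D input layout tag

    template <int NDimSpatial>
    using InLayout = std::tuple_element_t<NDimSpatial - 1, std::tuple<NWGC, NHWGC, NDHWGC>>;

    static_assert(std::is_same_v<InLayout<2>, NHWGC>, "2D convolution picks NHWGC");

    int main() {}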
@@ -216,9 +216,9 @@ __global__ void
 //
 // Supports:
 // @li         Forward convolution with up to 3 spatial dimensions
-// @li         Input tensor in NWC data format
-// @li         Weight tensor in KXC data format
-// @li         Output tensor in NWK data format
+// @li         Input tensor in GNWC data format
+// @li         Weight tensor in GKXC data format
+// @li         Output tensor in GNWK data format
 //
 // 1D:
 // out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
...@@ -302,7 +302,7 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS ...@@ -302,7 +302,7 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
template <typename ALay, template <typename ALay,
typename std::enable_if<NDimSpatial == 1 && typename std::enable_if<NDimSpatial == 1 &&
is_same_v<ALay, tensor_layout::convolution::NWC>, is_same_v<ALay, tensor_layout::convolution::GNWC>,
bool>::type = false> bool>::type = false>
static auto static auto
MakeAGridDescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, MakeAGridDescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
...@@ -319,16 +319,6 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS ...@@ -319,16 +319,6 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
const index_t N = a_g_n_c_wis_lengths[1]; const index_t N = a_g_n_c_wis_lengths[1];
const index_t C = a_g_n_c_wis_lengths[2]; const index_t C = a_g_n_c_wis_lengths[2];
const index_t GemmMRaw = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
index_t{1},
std::multiplies<index_t>());
const index_t GemmKRaw = C * std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
index_t{1},
std::multiplies<index_t>());
const index_t Wi = a_g_n_c_wis_lengths[3]; const index_t Wi = a_g_n_c_wis_lengths[3];
const index_t Wo = e_g_n_k_wos_lengths[3]; const index_t Wo = e_g_n_k_wos_lengths[3];
...@@ -338,8 +328,13 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS ...@@ -338,8 +328,13 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
if constexpr(ConvForwardSpecialization == if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
{ {
const index_t NWo = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
index_t{1},
std::multiplies<index_t>());
const auto in_gemmmraw_gemmk_grid_desc = const auto in_gemmmraw_gemmk_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(GemmMRaw, GemmKRaw)); make_naive_tensor_descriptor_packed(make_tuple(NWo, C));
const auto in_gemmm_gemmk_grid_desc = const auto in_gemmm_gemmk_grid_desc =
matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmk_grid_desc); matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmk_grid_desc);
...@@ -414,7 +409,7 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS ...@@ -414,7 +409,7 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
template <typename ALay, template <typename ALay,
typename std::enable_if<NDimSpatial == 2 && typename std::enable_if<NDimSpatial == 2 &&
is_same_v<ALay, tensor_layout::convolution::NHWC>, is_same_v<ALay, tensor_layout::convolution::GNHWC>,
bool>::type = false> bool>::type = false>
static auto static auto
MakeAGridDescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths, MakeAGridDescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
...@@ -431,16 +426,6 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS ...@@ -431,16 +426,6 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
const index_t N = a_g_n_c_wis_lengths[1]; const index_t N = a_g_n_c_wis_lengths[1];
const index_t C = a_g_n_c_wis_lengths[2]; const index_t C = a_g_n_c_wis_lengths[2];
const index_t GemmMRaw = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
index_t{1},
std::multiplies<index_t>());
const index_t GemmKRaw = C * std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
index_t{1},
std::multiplies<index_t>());
const index_t Hi = a_g_n_c_wis_lengths[3]; const index_t Hi = a_g_n_c_wis_lengths[3];
const index_t Wi = a_g_n_c_wis_lengths[4]; const index_t Wi = a_g_n_c_wis_lengths[4];
...@@ -453,8 +438,13 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS ...@@ -453,8 +438,13 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
if constexpr(ConvForwardSpecialization == if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
{ {
const index_t NHoWo = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
index_t{1},
std::multiplies<index_t>());
const auto in_gemmmraw_gemmkraw_grid_desc = const auto in_gemmmraw_gemmkraw_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(GemmMRaw, GemmKRaw)); make_naive_tensor_descriptor_packed(make_tuple(NHoWo, C));
const auto in_gemmm_gemmk_grid_desc = const auto in_gemmm_gemmk_grid_desc =
matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_grid_desc); matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_grid_desc);
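The Filter1x1Stride1Pad0 branches above can drop the embed/merge transforms entirely: with a 1x1 filter, unit stride and no padding, each output pixel reads exactly one input pixel, so the packed input buffer already is the (N*Ho*Wo) x C GEMM matrix. A stand-alone sketch of that identity:

    // Same memory, two views: packed NHWC input vs. (N*Ho*Wo) x C matrix.
    #include <cassert>

    int main()
    {
        const int N = 2, Hi = 5, Wi = 7, C = 8;
        const int Ho = Hi, Wo = Wi; // 1x1 filter, stride 1, pad 0

        // flat offset of in(n, h, w, c) in the packed NHWC buffer
        auto in_offset = [&](int n, int h, int w, int c) {
            return ((n * Hi + h) * Wi + w) * C + c;
        };
        // flat offset of A(m, k) in the (N*Ho*Wo) x C row-major view
        auto a_offset = [&](int m, int k) { return m * C + k; };

        const int n = 1, h = 3, w = 4, c = 5;
        const int m = (n * Ho + h) * Wo + w; // GEMM row of this output pixel
        assert(in_offset(n, h, w, c) == a_offset(m, c)); // no im2col copy needed
    }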
@@ -539,8 +529,152 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
     }

     template <typename ALay,
-              typename std::enable_if<NDimSpatial == 2 &&
-                                          is_same_v<ALay, tensor_layout::convolution::G_N_HW_C>,
+              typename std::enable_if<NDimSpatial == 3 &&
+                                          is_same_v<ALay, tensor_layout::convolution::GNDHWC>,
+              bool>::type = false>
+    static auto
+    MakeAGridDescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+                            const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                            const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                            const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                            const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
+                            const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
+                            const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                            const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                            const std::array<index_t, NDimSpatial>& input_left_pads,
+                            const std::array<index_t, NDimSpatial>& input_right_pads)
+    {
+        const index_t N = a_g_n_c_wis_lengths[1];
+        const index_t C = a_g_n_c_wis_lengths[2];
+
+        const index_t Di = a_g_n_c_wis_lengths[3];
+        const index_t Hi = a_g_n_c_wis_lengths[4];
+        const index_t Wi = a_g_n_c_wis_lengths[5];
+
+        const index_t Do = e_g_n_k_wos_lengths[3];
+        const index_t Ho = e_g_n_k_wos_lengths[4];
+        const index_t Wo = e_g_n_k_wos_lengths[5];
+
+        const index_t ConvStrideD = conv_filter_strides[0];
+        const index_t ConvStrideH = conv_filter_strides[1];
+        const index_t ConvStrideW = conv_filter_strides[2];
+
+        if constexpr(ConvForwardSpecialization ==
+                     ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
+        {
+            const index_t NDoHoWo =
+                N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
+                                    e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
+                                    index_t{1},
+                                    std::multiplies<index_t>());
+
+            const auto in_gemmmraw_gemmkraw_grid_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(NDoHoWo, C));
+
+            const auto in_gemmm_gemmk_grid_desc =
+                matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_grid_desc);
+
+            return in_gemmm_gemmk_grid_desc;
+        }
+        else if constexpr(ConvForwardSpecialization ==
+                          ConvolutionForwardSpecialization::Filter1x1Pad0)
+        {
+            const auto in_n_di_hi_wi_c_grid_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
+
+            const auto in_n_do_ho_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_di_hi_wi_c_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)),
+                           make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
+                           make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
+                           make_pass_through_transform(C)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+
+            const auto in_gemmmraw_gemmkraw_grid_desc = transform_tensor_descriptor(
+                in_n_do_ho_wo_c_grid_desc,
+                make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmm_gemmk_grid_desc =
+                matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_grid_desc);
+
+            return in_gemmm_gemmk_grid_desc;
+        }
+        else
+        {
+            const index_t Z = b_g_k_c_xs_lengths[3];
+            const index_t Y = b_g_k_c_xs_lengths[4];
+            const index_t X = b_g_k_c_xs_lengths[5];
+
+            const index_t ConvDilationD = conv_filter_dilations[0];
+            const index_t ConvDilationH = conv_filter_dilations[1];
+            const index_t ConvDilationW = conv_filter_dilations[2];
+
+            const index_t InLeftPadD = input_left_pads[0];
+            const index_t InLeftPadH = input_left_pads[1];
+            const index_t InLeftPadW = input_left_pads[2];
+
+            const index_t InRightPadD = input_right_pads[0];
+            const index_t InRightPadH = input_right_pads[1];
+            const index_t InRightPadW = input_right_pads[2];
+
+            const auto in_n_di_hi_wi_c_grid_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
+
+            const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+                in_n_di_hi_wi_c_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Di, InLeftPadD, InRightPadD),
+                           make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(C)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+
+            const auto in_n_z_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_hip_wip_c_grid_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)),
+                    make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(C)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+                make_tuple(Sequence<0>{},
+                           Sequence<1, 2>{},
+                           Sequence<3, 4>{},
+                           Sequence<5, 6>{},
+                           Sequence<7>{}));
+
+            const auto in_gemmmraw_gemmkraw_grid_desc = transform_tensor_descriptor(
+                in_n_z_do_y_ho_x_wo_c_grid_desc,
+                make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
+                           make_merge_transform(make_tuple(Z, Y, X, C))),
+                make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmm_gemmk_grid_desc =
+                matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_grid_desc);
+
+            return in_gemmm_gemmk_grid_desc;
+        }
+    }
+
+    // TODO: implement ck::tensor_layout::convolution that describe packed/strided dimension as
+    // properties
+    template <typename ALay,
+              typename std::enable_if<NDimSpatial == 1 &&
+                                          (is_same_v<ALay, tensor_layout::convolution::G_NW_C> ||
+                                           is_same_v<ALay, tensor_layout::convolution::NWGC>),
               bool>::type = false>
     static auto
     MakeAGridDescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
@@ -557,15 +691,127 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         const index_t N = a_g_n_c_wis_lengths[1];
         const index_t C = a_g_n_c_wis_lengths[2];

-        const index_t GemmMRaw = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
-                                                     e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                     index_t{1},
-                                                     std::multiplies<index_t>());
-
-        const index_t GemmKRaw = C * std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
-                                                     b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
-                                                     index_t{1},
-                                                     std::multiplies<index_t>());
+        const index_t Wi = a_g_n_c_wis_lengths[3];
+
+        const index_t Wo = e_g_n_k_wos_lengths[3];
+
+        const index_t ConvStrideW = conv_filter_strides[0];
+
+        if constexpr(ConvForwardSpecialization ==
+                     ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
+        {
+            const index_t NHoWo = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
+                                                      e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
+                                                      index_t{1},
+                                                      std::multiplies<index_t>());
+
+            // This is different
+            const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial];
+            const auto CStride     = I1;
+
+            const auto in_gemmmraw_gemmk_grid_desc =
+                make_naive_tensor_descriptor(make_tuple(NHoWo, C), make_tuple(WiStride, CStride));
+
+            const auto in_gemmm_gemmk_grid_desc =
+                matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmk_grid_desc);
+
+            return in_gemmm_gemmk_grid_desc;
+        }
+        else if constexpr(ConvForwardSpecialization ==
+                          ConvolutionForwardSpecialization::Filter1x1Pad0)
+        {
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t WiStride = a_g_n_c_wis_strides[3];
+            const auto CStride     = I1;
+
+            const auto in_n_wi_c_grid_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Wi, C), make_tuple(NStride, WiStride, CStride));
+
+            const auto in_n_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_wi_c_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            const auto in_gemmmraw_gemmkraw_grid_desc = transform_tensor_descriptor(
+                in_n_wo_c_grid_desc,
+                make_tuple(make_merge_transform(make_tuple(N, Wo)), make_pass_through_transform(C)),
+                make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmm_gemmk_grid_desc =
+                matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_grid_desc);
+
+            return in_gemmm_gemmk_grid_desc;
+        }
+        else
+        {
+            const index_t X = b_g_k_c_xs_lengths[3];
+
+            const index_t ConvDilationW = conv_filter_dilations[0];
+
+            const index_t InLeftPadW  = input_left_pads[0];
+            const index_t InRightPadW = input_right_pads[0];
+
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t WiStride = a_g_n_c_wis_strides[3];
+            const auto CStride     = I1;
+
+            const auto in_n_wi_c_grid_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Wi, C), make_tuple(NStride, WiStride, CStride));
+
+            const auto in_n_wip_c_grid_desc = transform_tensor_descriptor(
+                in_n_wi_c_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            const auto in_n_x_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_wip_c_grid_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+            const auto in_gemmmraw_gemmk_grid_desc =
+                transform_tensor_descriptor(in_n_x_wo_c_grid_desc,
+                                            make_tuple(make_merge_transform(make_tuple(N, Wo)),
+                                                       make_merge_transform(make_tuple(X, C))),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmm_gemmk_grid_desc =
+                matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmk_grid_desc);
+
+            return in_gemmm_gemmk_grid_desc;
+        }
+    }
+
+    template <typename ALay,
+              typename std::enable_if<NDimSpatial == 2 &&
+                                          (is_same_v<ALay, tensor_layout::convolution::G_NHW_C> ||
+                                           is_same_v<ALay, tensor_layout::convolution::NHWGC>),
+              bool>::type = false>
+    static auto
+    MakeAGridDescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+                            const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                            const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                            const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                            const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
+                            const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
+                            const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                            const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                            const std::array<index_t, NDimSpatial>& input_left_pads,
+                            const std::array<index_t, NDimSpatial>& input_right_pads)
+    {
+        const index_t N = a_g_n_c_wis_lengths[1];
+        const index_t C = a_g_n_c_wis_lengths[2];

         const index_t Hi = a_g_n_c_wis_lengths[3];
         const index_t Wi = a_g_n_c_wis_lengths[4];
@@ -579,12 +825,17 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
+            const index_t NHoWo = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
+                                                      e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
+                                                      index_t{1},
+                                                      std::multiplies<index_t>());
+
             // This is different
-            const index_t CStride = a_g_n_c_wis_strides[2];
-            const index_t WStride = a_g_n_c_wis_strides[2 + NDimSpatial];
+            const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial];
+            const auto CStride     = I1;

             const auto in_gemmmraw_gemmkraw_grid_desc =
-                make_naive_tensor_descriptor(make_tuple(GemmMRaw, GemmKRaw), make_tuple(WStride, CStride);
+                make_naive_tensor_descriptor(make_tuple(NHoWo, C), make_tuple(WiStride, CStride));

             const auto in_gemmm_gemmk_grid_desc =
                 matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_grid_desc);
@@ -595,12 +846,13 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
                           ConvolutionForwardSpecialization::Filter1x1Pad0)
         {
             // This is different
-            const auto in_n_hi_wi_c_grid_desc =
-                make_naive_tensor_descriptor(make_tuple(N, Hi, Wi, C),
-                                             make_tuple(a_g_n_c_wis_srides[1],
-                                                        a_g_n_c_wis_srides[3],
-                                                        a_g_n_c_wis_srides[4],
-                                                        a_g_n_c_wis_srides[2]));
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t HiStride = a_g_n_c_wis_strides[3];
+            const index_t WiStride = a_g_n_c_wis_strides[4];
+            const auto CStride     = I1;
+
+            const auto in_n_hi_wi_c_grid_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride));

             const auto in_n_ho_wo_c_grid_desc = transform_tensor_descriptor(
                 in_n_hi_wi_c_grid_desc,
@@ -638,12 +890,13 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
             const index_t InRightPadW = input_right_pads[1];

             // This is different
-            const auto in_n_hi_wi_c_grid_desc =
-                make_naive_tensor_descriptor(make_tuple(N, Hi, Wi, C),
-                                             make_tuple(a_g_n_c_wis_srides[1],
-                                                        a_g_n_c_wis_srides[3],
-                                                        a_g_n_c_wis_srides[4],
-                                                        a_g_n_c_wis_srides[2]));
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t HiStride = a_g_n_c_wis_strides[3];
+            const index_t WiStride = a_g_n_c_wis_strides[4];
+            const auto CStride     = I1;
+
+            const auto in_n_hi_wi_c_grid_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride));

             const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
                 in_n_hi_wi_c_grid_desc,
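All of these strided overloads read a_g_n_c_wis_strides in [G, N, C, spatial...] order and hard-code CStride = I1, which presumes C is the fastest-varying physical dimension. An illustration of my own (not CK code) of how those strides come out for a packed NHWGC tensor:

    // Element strides of a packed NHWGC tensor, reported in the
    // [G, N, C, Hi, Wi] order that a_g_n_c_wis_strides uses.
    #include <array>
    #include <cassert>

    int main()
    {
        const long N = 2, Hi = 5, Wi = 7, G = 4, C = 8;

        // physical layout NHWGC: innermost dimension last
        const long c_stride  = 1;
        const long g_stride  = C;            // step to the next group
        const long wi_stride = G * C;        // step to the next pixel
        const long hi_stride = Wi * G * C;
        const long n_stride  = Hi * Wi * G * C;

        // logical [G, N, C, Hi, Wi] order consumed by the device op
        std::array<long, 5> a_g_n_c_wis_strides{
            g_stride, n_stride, c_stride, hi_stride, wi_stride};

        // this is why the strided paths can hard-code CStride = I1 and
        // read WiStride out of slot [2 + NDimSpatial]
        assert(a_g_n_c_wis_strides[2] == 1);
        assert(a_g_n_c_wis_strides[2 + 2] == wi_stride); // NDimSpatial == 2
    }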
@@ -680,7 +933,8 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
     template <typename ALay,
               typename std::enable_if<NDimSpatial == 3 &&
-                                          is_same_v<ALay, tensor_layout::convolution::NDHWC>,
+                                          (is_same_v<ALay, tensor_layout::convolution::G_NDHW_C> ||
+                                           is_same_v<ALay, tensor_layout::convolution::NDHWGC>),
                                       bool>::type = false>
     static auto
     MakeAGridDescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
@@ -697,16 +951,6 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         const index_t N = a_g_n_c_wis_lengths[1];
         const index_t C = a_g_n_c_wis_lengths[2];

-        const index_t GemmMRaw = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
-                                                     e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                     index_t{1},
-                                                     std::multiplies<index_t>());
-
-        const index_t GemmKRaw = C * std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
-                                                     b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
-                                                     index_t{1},
-                                                     std::multiplies<index_t>());
-
         const index_t Di = a_g_n_c_wis_lengths[3];
         const index_t Hi = a_g_n_c_wis_lengths[4];
         const index_t Wi = a_g_n_c_wis_lengths[5];
@@ -722,8 +966,18 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
+            const index_t NDoHoWo =
+                N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
+                                    e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
+                                    index_t{1},
+                                    std::multiplies<index_t>());
+
+            // This is different
+            const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial];
+            const auto CStride     = I1;
+
             const auto in_gemmmraw_gemmkraw_grid_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(GemmMRaw, GemmKRaw));
+                make_naive_tensor_descriptor(make_tuple(NDoHoWo, C), make_tuple(WiStride, CStride));

             const auto in_gemmm_gemmk_grid_desc =
                 matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_grid_desc);
@@ -733,8 +987,16 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         else if constexpr(ConvForwardSpecialization ==
                           ConvolutionForwardSpecialization::Filter1x1Pad0)
         {
-            const auto in_n_di_hi_wi_c_grid_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t DiStride = a_g_n_c_wis_strides[3];
+            const index_t HiStride = a_g_n_c_wis_strides[4];
+            const index_t WiStride = a_g_n_c_wis_strides[5];
+            const auto CStride     = I1;
+
+            const auto in_n_di_hi_wi_c_grid_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Di, Hi, Wi, C),
+                make_tuple(NStride, DiStride, HiStride, WiStride, CStride));

             const auto in_n_do_ho_wo_c_grid_desc = transform_tensor_descriptor(
                 in_n_di_hi_wi_c_grid_desc,
@@ -778,8 +1040,16 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
             const index_t InRightPadH = input_right_pads[1];
             const index_t InRightPadW = input_right_pads[2];

-            const auto in_n_di_hi_wi_c_grid_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t DiStride = a_g_n_c_wis_strides[3];
+            const index_t HiStride = a_g_n_c_wis_strides[4];
+            const index_t WiStride = a_g_n_c_wis_strides[5];
+            const auto CStride     = I1;
+
+            const auto in_n_di_hi_wi_c_grid_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Di, Hi, Wi, C),
+                make_tuple(NStride, DiStride, HiStride, WiStride, CStride));

             const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
                 in_n_di_hi_wi_c_grid_desc,
@@ -824,9 +1094,9 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
     }

     template <typename BLay,
-              typename std::enable_if<is_same_v<BLay, tensor_layout::convolution::KXC> ||
-                                          is_same_v<BLay, tensor_layout::convolution::KYXC> ||
-                                          is_same_v<BLay, tensor_layout::convolution::KZYXC>,
+              typename std::enable_if<is_same_v<BLay, tensor_layout::convolution::GKXC> ||
+                                          is_same_v<BLay, tensor_layout::convolution::GKYXC> ||
+                                          is_same_v<BLay, tensor_layout::convolution::GKZYXC>,
                                       bool>::type = false>
     static auto
     MakeBGridDescriptor_N_K(const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
@@ -835,15 +1105,12 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         const index_t K = b_g_k_c_xs_lengths[1];
         const index_t C = b_g_k_c_xs_lengths[2];

-        const index_t GemmNRaw = K;
-
-        const index_t GemmKRaw = C * std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
-                                                     b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
-                                                     index_t{1},
-                                                     std::multiplies<index_t>());
+        const index_t YX = std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
+                                           b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
+                                           index_t{1},
+                                           std::multiplies<index_t>());

-        const auto wei_k_yxc_grid_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(GemmNRaw, GemmKRaw));
+        const auto wei_k_yxc_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(K, YX * C));

         const auto wei_gemmn_gemmk_grid_desc =
             matrix_padder.PadBDescriptor_N_K(wei_k_yxc_grid_desc);
@@ -854,7 +1121,10 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
     template <typename BLay,
               typename std::enable_if<is_same_v<BLay, tensor_layout::convolution::G_K_X_C> ||
                                           is_same_v<BLay, tensor_layout::convolution::G_K_YX_C> ||
-                                          is_same_v<BLay, tensor_layout::convolution::G_K_ZYX_C>,
+                                          is_same_v<BLay, tensor_layout::convolution::G_K_ZYX_C> ||
+                                          is_same_v<BLay, tensor_layout::convolution::KXGC> ||
+                                          is_same_v<BLay, tensor_layout::convolution::KYXGC> ||
+                                          is_same_v<BLay, tensor_layout::convolution::KZYXGC>,
                                       bool>::type = false>
     static auto
     MakeBGridDescriptor_N_K(const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
@@ -863,26 +1133,34 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         const index_t K = b_g_k_c_xs_lengths[1];
         const index_t C = b_g_k_c_xs_lengths[2];

-        const index_t GemmNRaw = K;
-
-        const index_t GemmKRaw = C * std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
-                                                     b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
-                                                     index_t{1},
-                                                     std::multiplies<index_t>());
-
-        const auto wei_k_yxc_grid_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(GemmNRaw, GemmKRaw));
+        const index_t YX = std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
+                                           b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
+                                           index_t{1},
+                                           std::multiplies<index_t>());
+
+        const index_t KStride = b_g_k_c_xs_strides[1];
+        const index_t XStride = b_g_k_c_xs_strides[2 + NDimSpatial];
+        const auto CStride    = I1;
+
+        const auto wei_k_yx_c_grid_desc = make_naive_tensor_descriptor(
+            make_tuple(K, YX, C), make_tuple(KStride, XStride, CStride));
+
+        const auto wei_gemmnraw_gemmkraw_grid_desc = transform_tensor_descriptor(
+            wei_k_yx_c_grid_desc,
+            make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(YX, C))),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));

         const auto wei_gemmn_gemmk_grid_desc =
-            matrix_padder.PadBDescriptor_N_K(wei_k_yxc_grid_desc);
+            matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_grid_desc);

         return wei_gemmn_gemmk_grid_desc;
     }

     template <typename ELay,
-              typename std::enable_if<is_same_v<ELay, tensor_layout::convolution::NWK> ||
-                                          is_same_v<ELay, tensor_layout::convolution::NHWK> ||
-                                          is_same_v<ELay, tensor_layout::convolution::NDHWK>,
+              typename std::enable_if<is_same_v<ELay, tensor_layout::convolution::GNWK> ||
+                                          is_same_v<ELay, tensor_layout::convolution::GNHWK> ||
+                                          is_same_v<ELay, tensor_layout::convolution::GNDHWK>,
                                       bool>::type = false>
     static auto
     MakeEGridDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
@@ -891,15 +1169,13 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         const index_t N = e_g_n_k_wos_lengths[1];
         const index_t K = e_g_n_k_wos_lengths[2];

-        const index_t GemmMRaw = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
-                                                     e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                     index_t{1},
-                                                     std::multiplies<index_t>());
-
-        const index_t GemmNRaw = K;
+        const index_t NHoWo = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
+                                                  e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
+                                                  index_t{1},
+                                                  std::multiplies<index_t>());

         const auto out_gemmmraw_gemmnraw_grid_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(GemmMRaw, GemmNRaw));
+            make_naive_tensor_descriptor_packed(make_tuple(NHoWo, K));

         const auto out_gemmm_gemmn_grid_desc =
             matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_grid_desc);
@@ -908,30 +1184,30 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
     }

     template <typename ELay,
-              typename std::enable_if<is_same_v<ELay, tensor_layout::convolution::NW_K> ||
-                                          is_same_v<ELay, tensor_layout::convolution::NHW_K> ||
-                                          is_same_v<ELay, tensor_layout::convolution::NDHW_K>,
+              typename std::enable_if<is_same_v<ELay, tensor_layout::convolution::G_NW_K> ||
+                                          is_same_v<ELay, tensor_layout::convolution::G_NHW_K> ||
+                                          is_same_v<ELay, tensor_layout::convolution::G_NDHW_K> ||
+                                          is_same_v<ELay, tensor_layout::convolution::NWGK> ||
+                                          is_same_v<ELay, tensor_layout::convolution::NHWGK> ||
+                                          is_same_v<ELay, tensor_layout::convolution::NDHWGK>,
                                       bool>::type = false>
     static auto
     MakeEGridDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
                             const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides)
     {
-        namespace ctc = ck::tensor_layout::convolution;
-
         const index_t N = e_g_n_k_wos_lengths[1];
         const index_t K = e_g_n_k_wos_lengths[2];

+        const auto KStride     = I1;
         const index_t WoStride = e_g_n_k_wos_strides[NDimSpatial + 2];

-        const index_t GemmMRaw = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
-                                                     e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                     index_t{1},
-                                                     std::multiplies<index_t>());
-
-        const index_t GemmNRaw = K;
+        const index_t NHoWo = N * std::accumulate(e_g_n_k_wos_lengths.begin() + 3,
+                                                  e_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
+                                                  index_t{1},
+                                                  std::multiplies<index_t>());

         const auto out_gemmmraw_gemmnraw_grid_desc =
-            make_naive_tensor_descriptor(make_tuple(GemmMRaw, GemmNRaw), make_tuple(WoStride, I1));
+            make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(WoStride, KStride));

         const auto out_gemmm_gemmn_grid_desc =
             matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_grid_desc);
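The strided B path above views the weight as (K, YX, C) with explicit strides and merges YX and C into the GEMM-K dimension; the packed path skips the merge because the filter taps and C are already contiguous. An illustration of my own (not CK code) showing the two addressings agree for a packed KYXGC weight:

    // Folding a strided (K, YX, C) weight view into (GemmN = K, GemmK = YX * C).
    #include <cassert>

    int main()
    {
        const long K = 4, Y = 3, X = 3, C = 8, G = 2, g = 1;
        const long YX = Y * X;

        // flat offset of wei(k, y, x, g, c) in a packed KYXGC buffer
        auto wei_offset = [&](long k, long y, long x, long c) {
            return (((k * Y + y) * X + x) * G + g) * C + c;
        };

        // strided (K, YX, C) view of group g
        const long CStride = 1;
        const long XStride = G * C;      // step between adjacent filter taps
        const long KStride = YX * G * C; // step between output channels

        // GEMM view: row n = k, column kk = (y * X + x) * C + c
        auto b_offset = [&](long n, long kk) {
            return n * KStride + (kk / C) * XStride + (kk % C) * CStride + g * C;
        };

        // both address the same element
        assert(b_offset(2, (1 * X + 2) * C + 5) == wei_offset(2, 1, 2, 5));
    }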
@@ -1342,8 +1618,12 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         printf("%d\n", itmp++);

         // check vector access of A
-        if constexpr(is_same_v<ALayout, ctc::NWC> || is_same_v<ALayout, ctc::NHWC> ||
-                     is_same_v<ALayout, ctc::NDHWC>)
+        // FIXME: layout
+        if constexpr(is_same_v<ALayout, ctc::G_NW_C> || is_same_v<ALayout, ctc::G_NHW_C> ||
+                     is_same_v<ALayout, ctc::G_NDHW_C> || is_same_v<ALayout, ctc::GNWC> ||
+                     is_same_v<ALayout, ctc::GNHWC> || is_same_v<ALayout, ctc::GNDHWC> ||
+                     is_same_v<ALayout, ctc::NWGC> || is_same_v<ALayout, ctc::NHWGC> ||
+                     is_same_v<ALayout, ctc::NDHWGC>)
         {
             const index_t C = arg.a_g_n_c_wis_lengths_[2];
@@ -1360,8 +1640,13 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         printf("%d\n", itmp++);

         // check vector access of B
-        if constexpr(is_same_v<BLayout, ctc::KXC> || is_same_v<BLayout, ctc::KYXC> ||
-                     is_same_v<BLayout, ctc::KZYXC>)
+        // FIXME: layout
+        if constexpr(is_same_v<BLayout, ctc::G_K_X_C> || is_same_v<BLayout, ctc::G_K_YX_C> ||
+                     is_same_v<BLayout, ctc::G_K_ZYX_C> || is_same_v<BLayout, ctc::GKXC> ||
+                     is_same_v<BLayout, ctc::GKYXC> || is_same_v<BLayout, ctc::GKZYXC> ||
+                     is_same_v<BLayout, ctc::KXGC> || is_same_v<BLayout, ctc::KYXGC> ||
+                     is_same_v<BLayout, ctc::KZYXGC>)
         {
             const index_t C = arg.b_g_k_c_xs_lengths_[2];
@@ -1383,9 +1668,12 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         static_for<0, NumDTensor, 1>{}([&](auto i) {
             using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;

-            if constexpr(is_same_v<DLayout, ctc::NWK> || is_same_v<DLayout, ctc::NHWK> ||
-                         is_same_v<DLayout, ctc::NDHWK> || is_same_v<DLayout, ctc::NW_K> ||
-                         is_same_v<DLayout, ctc::NHW_K> || is_same_v<DLayout, ctc::NDHW_K>)
+            // FIXME: layout
+            if constexpr(is_same_v<DLayout, ctc::G_NW_K> || is_same_v<DLayout, ctc::G_NHW_K> ||
+                         is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
+                         is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
+                         is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
+                         is_same_v<DLayout, ctc::NDHWGK>)
             {
                 const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
@@ -1408,8 +1696,11 @@ struct DeviceConvFwdMultipleD_Xdl_CShuffle : public DeviceConvFwdMultipleD<NDimS
         printf("%d\n", itmp++);

         // check vector access of E
-        if constexpr(is_same_v<ELayout, ctc::NWK> || is_same_v<ELayout, ctc::NHWK> ||
-                     is_same_v<ELayout, ctc::NDHWK>)
+        if constexpr(is_same_v<ELayout, ctc::G_NW_K> || is_same_v<ELayout, ctc::G_NHW_K> ||
+                     is_same_v<ELayout, ctc::G_NDHW_K> || is_same_v<ELayout, ctc::GNWK> ||
+                     is_same_v<ELayout, ctc::GNHWK> || is_same_v<ELayout, ctc::GNDHWK> ||
+                     is_same_v<ELayout, ctc::NWGK> || is_same_v<ELayout, ctc::NHWGK> ||
+                     is_same_v<ELayout, ctc::NDHWGK>)
         {
             const index_t K = arg.e_g_n_k_wos_lengths_[2];
...
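The branches above only gate which tensor lengths feed the vector-access checks. The check itself is, in essence, a divisibility condition on the fastest-moving dimension (C for A/B, K for D/E); a generic sketch with illustrative names, since the exact CK parameter names are not shown in this diff:

    // Vectorized loads along the fastest dimension are only legal when that
    // dimension is a whole multiple of the vector width.
    #include <cassert>

    bool is_vector_access_ok(long fastest_dim_length, long scalar_per_vector)
    {
        return fastest_dim_length % scalar_per_vector == 0;
    }

    int main()
    {
        assert(is_vector_access_ok(64, 8));  // C = 64, 8-wide loads: ok
        assert(!is_vector_access_ok(30, 8)); // C = 30: argument unsupported
    }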
@@ -25,6 +25,39 @@ struct ColumnMajor : public BaseTensorLayout

 namespace convolution {

+// input tensor
+// packed NCW/NCHW/NCDHW
+struct NCW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NCW";
+};
+
+struct NCHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NCHW";
+};
+
+struct NCDHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NCDHW";
+};
+
+// input tensor
+// packed GNCW/GNCHW/GNCDHW
+struct GNCW : public BaseTensorLayout
+{
+    static constexpr const char* name = "GNCW";
+};
+
+struct GNCHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "GNCHW";
+};
+
+struct GNCDHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "GNCDHW";
+};
+
 // input tensor
 // packed NWC/NHWC/NDHWC
 struct NWC : public BaseTensorLayout
@@ -43,20 +76,88 @@ struct NDHWC : public BaseTensorLayout
 };

 // input tensor
-// packed NCW/NCHW/NCDHW
-struct NCW : public BaseTensorLayout
+// packed GNWC/GNHWC/GNDHWC
+struct GNWC : public BaseTensorLayout
 {
-    static constexpr const char* name = "NCW";
+    static constexpr const char* name = "GNWC";
 };

-struct NCHW : public BaseTensorLayout
+struct GNHWC : public BaseTensorLayout
 {
-    static constexpr const char* name = "NCHW";
+    static constexpr const char* name = "GNHWC";
 };

-struct NCDHW : public BaseTensorLayout
+struct GNDHWC : public BaseTensorLayout
 {
-    static constexpr const char* name = "NCDHW";
+    static constexpr const char* name = "GNDHWC";
+};
+
+// input tensor
+// packed NWGC/NHWGC/NDHWGC
+struct NWGC : public BaseTensorLayout
+{
+    static constexpr const char* name = "NWGC";
+};
+
+struct NHWGC : public BaseTensorLayout
+{
+    static constexpr const char* name = "NHWGC";
+};
+
+struct NDHWGC : public BaseTensorLayout
+{
+    static constexpr const char* name = "NDHWGC";
+};
+
+// input tensor
+// strided layout
+struct G_NW_C : public BaseTensorLayout
+{
+    static constexpr const char* name = "G_NW_C";
+};
+
+struct G_NHW_C : public BaseTensorLayout
+{
+    static constexpr const char* name = "G_NHW_C";
+};
+
+struct G_NDHW_C : public BaseTensorLayout
+{
+    static constexpr const char* name = "G_NDHW_C";
+};
+
+// weight tensor
+// packed KCX/KCYX/KCZYX
+struct KCX : public BaseTensorLayout
+{
+    static constexpr const char* name = "KCX";
+};
+
+struct KCYX : public BaseTensorLayout
+{
+    static constexpr const char* name = "KCYX";
+};
+
+struct KCZYX : public BaseTensorLayout
+{
+    static constexpr const char* name = "KCZYX";
+};
+
+// weight tensor
+// packed GKCX/GKCYX/GKCZYX
+struct GKCX : public BaseTensorLayout
+{
+    static constexpr const char* name = "GKCX";
+};
+
+struct GKCYX : public BaseTensorLayout
+{
+    static constexpr const char* name = "GKCYX";
+};
+
+struct GKCZYX : public BaseTensorLayout
+{
+    static constexpr const char* name = "GKCZYX";
 };

 // weight tensor
@@ -77,20 +178,88 @@ struct KZYXC : public BaseTensorLayout
 };

 // weight tensor
-// packed KCX/KCYX/KCZYX
-struct KCX : public BaseTensorLayout
+// packed GKXC/GKYXC/GKZYXC
+struct GKXC : public BaseTensorLayout
 {
-    static constexpr const char* name = "KCX";
+    static constexpr const char* name = "GKXC";
 };

-struct KCYX : public BaseTensorLayout
+struct GKYXC : public BaseTensorLayout
 {
-    static constexpr const char* name = "KCYX";
+    static constexpr const char* name = "GKYXC";
 };

-struct KCZYX : public BaseTensorLayout
+struct GKZYXC : public BaseTensorLayout
 {
-    static constexpr const char* name = "KCZYX";
+    static constexpr const char* name = "GKZYXC";
+};
+
+// weight tensor
+// packed KXGC/KYXGC/KZYXGC
+struct KXGC : public BaseTensorLayout
+{
+    static constexpr const char* name = "KXGC";
+};
+
+struct KYXGC : public BaseTensorLayout
+{
+    static constexpr const char* name = "KYXGC";
+};
+
+struct KZYXGC : public BaseTensorLayout
+{
+    static constexpr const char* name = "KZYXGC";
+};
+
+// weight tensor
+// strided
+struct G_K_X_C : public BaseTensorLayout
+{
+    static constexpr const char* name = "G_K_X_C";
+};
+
+struct G_K_YX_C : public BaseTensorLayout
+{
+    static constexpr const char* name = "G_K_YX_C";
+};
+
+struct G_K_ZYX_C : public BaseTensorLayout
+{
+    static constexpr const char* name = "G_K_ZYX_C";
+};
+
+// output tensor
+// packed NKW/NKHW/NKDHW
+struct NKW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NKW";
+};
+
+struct NKHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NKHW";
+};
+
+struct NKDHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "NKDHW";
+};
+
+// output tensor
+// packed GNKW/GNKHW/GNKDHW
+struct GNKW : public BaseTensorLayout
+{
+    static constexpr const char* name = "GNKW";
+};
+
+struct GNKHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "GNKHW";
+};
+
+struct GNKDHW : public BaseTensorLayout
+{
+    static constexpr const char* name = "GNKDHW";
 };

 // output tensor
@@ -111,37 +280,54 @@ struct NDHWK : public BaseTensorLayout
 };

 // output tensor
-// packed NKW/NKHW/NKDHW
-struct NKW : public BaseTensorLayout
+// packed GNWK/GNHWK/GNDHWK
+struct GNWK : public BaseTensorLayout
 {
-    static constexpr const char* name = "NKW";
+    static constexpr const char* name = "GNWK";
 };

-struct NKHW : public BaseTensorLayout
+struct GNHWK : public BaseTensorLayout
 {
-    static constexpr const char* name = "NKHW";
+    static constexpr const char* name = "GNHWK";
 };

-struct NKDHW : public BaseTensorLayout
+struct GNDHWK : public BaseTensorLayout
 {
-    static constexpr const char* name = "NKDHW";
+    static constexpr const char* name = "GNDHWK";
+};
+
+// output tensor
+// packed NWGK/NHWGK/NDHWGK
+struct NWGK : public BaseTensorLayout
+{
+    static constexpr const char* name = "NWGK";
+};
+
+struct NHWGK : public BaseTensorLayout
+{
+    static constexpr const char* name = "NHWGK";
+};
+
+struct NDHWGK : public BaseTensorLayout
+{
+    static constexpr const char* name = "NDHWGK";
 };

 // output tensor
 // strided layout
-struct NW_K : public BaseTensorLayout
+struct G_NW_K : public BaseTensorLayout
 {
-    static constexpr const char* name = "NW_K";
+    static constexpr const char* name = "G_NW_K";
 };

-struct NHW_K : public BaseTensorLayout
+struct G_NHW_K : public BaseTensorLayout
 {
-    static constexpr const char* name = "NHW_K";
+    static constexpr const char* name = "G_NHW_K";
 };

-struct NDHW_K : public BaseTensorLayout
+struct G_NDHW_K : public BaseTensorLayout
 {
-    static constexpr const char* name = "NDHW_K";
+    static constexpr const char* name = "G_NDHW_K";
 };

 } // namespace convolution
...
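All of these layout structs are empty tag types: they carry no data, select overloads at compile time, and expose name for diagnostics. A stand-alone sketch of the usage pattern:

    // Empty layout tags: compile-time dispatch plus a printable name.
    #include <cstdio>
    #include <type_traits>

    struct BaseTensorLayout {};

    struct NHWGC : public BaseTensorLayout
    {
        static constexpr const char* name = "NHWGC";
    };

    struct GNHWC : public BaseTensorLayout
    {
        static constexpr const char* name = "GNHWC";
    };

    template <typename Layout>
    void print_layout()
    {
        static_assert(std::is_base_of_v<BaseTensorLayout, Layout>, "not a layout tag");
        std::printf("layout = %s\n", Layout::name);
    }

    int main()
    {
        print_layout<NHWGC>(); // prints "layout = NHWGC"
        print_layout<GNHWC>(); // prints "layout = GNHWC"
    }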
@@ -30,12 +30,8 @@ namespace host {
 // operation.
 // @tparam NumDimSpatial Number of spatial dimensions.
 //
-// FIXME: only support NDimSpatial = 1 to 3; only support NCHW and NHWC layout.
-// Need to be more general
+// tensor descriptor in GNCHW/GKCYX/GNKHW dimensional order
 template <ck::index_t NumDimSpatial,
-          typename InLayout,
-          typename WeiLayout,
-          typename OutLayout,
           typename InDataType,
           typename WeiDataType,
           typename OutDataType,
@@ -91,7 +87,6 @@ struct ReferenceConvFwd : public device::BaseOperator
     float Run(const Argument& arg)
     {
-        // tensor descriptor in NCHW/KXYC/NKHW dimensional order
         if constexpr(NumDimSpatial == 1)
         {
             auto func = [&](auto g, auto n, auto k, auto wo) {
...
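With the layout template parameters gone, the reference kernel always indexes tensors in the logical G, N, C/K, spatial order and leaves physical layout to the (transposed) host descriptors. A minimal stand-alone sketch of the 1D grouped loop nest it describes:

    // 1D grouped reference convolution: in[g][n][c][wi] * wei[g][k][c][x]
    // -> out[g][n][k][wo]. Physical layout is the descriptors' problem,
    // not this loop's.
    #include <cassert>
    #include <vector>

    int main()
    {
        const int G = 2, N = 1, K = 1, C = 1, Wi = 4, X = 3;
        const int stride = 1, dilation = 1, pad = 0;
        const int Wo = (Wi + 2 * pad - dilation * (X - 1) - 1) / stride + 1;

        std::vector<float> in(G * N * C * Wi, 1.f);
        std::vector<float> wei(G * K * C * X, 1.f);
        std::vector<float> out(G * N * K * Wo, 0.f);

        auto in_at  = [&](int g, int n, int c, int wi) { return ((g * N + n) * C + c) * Wi + wi; };
        auto wei_at = [&](int g, int k, int c, int x) { return ((g * K + k) * C + c) * X + x; };
        auto out_at = [&](int g, int n, int k, int wo) { return ((g * N + n) * K + k) * Wo + wo; };

        for(int g = 0; g < G; ++g)
            for(int n = 0; n < N; ++n)
                for(int k = 0; k < K; ++k)
                    for(int wo = 0; wo < Wo; ++wo)
                    {
                        float acc = 0.f;
                        for(int c = 0; c < C; ++c)
                            for(int x = 0; x < X; ++x)
                            {
                                const int wi = wo * stride + x * dilation - pad;
                                if(wi >= 0 && wi < Wi)
                                    acc += in[in_at(g, n, c, wi)] * wei[wei_at(g, k, c, x)];
                            }
                        out[out_at(g, n, k, wo)] = acc;
                    }

        assert(out[out_at(0, 0, 0, 0)] == 3.f); // X ones times ones
    }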
@@ -4,6 +4,7 @@

 #pragma once

 #include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"

@@ -11,109 +12,238 @@ namespace ck {
 namespace utils {
 namespace conv {

-template <typename InLayout>
-HostTensorDescriptor get_input_host_tensor_descriptor(const ck::utils::conv::ConvParam& param)
-{
-    if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NWC> ||
-                 ck::is_same_v<InLayout, ck::tensor_layout::convolution::NHWC> ||
-                 ck::is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
-    {
-        std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(param.N_),
-                                              static_cast<std::size_t>(param.C_)};
-
-        nhwc_lengths.insert(nhwc_lengths.begin() + 1,
-                            param.input_spatial_lengths_.begin(),
-                            param.input_spatial_lengths_.end());
-
-        return HostTensorDescriptor(nhwc_lengths);
-    }
-    else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NCW> ||
-                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::NCHW> ||
-                      ck::is_same_v<InLayout, ck::tensor_layout::convolution::NCDHW>)
-    {
-        std::vector<std::size_t> nchw_lengths{static_cast<std::size_t>(param.N_),
-                                              static_cast<std::size_t>(param.C_)};
-
-        nchw_lengths.insert(nchw_lengths.end(),
-                            param.input_spatial_lengths_.begin(),
-                            param.input_spatial_lengths_.end());
+namespace detail {
+
+template <typename OldLayout>
+std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
+{
+    if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNCW> ||
+                 ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKCX> ||
+                 ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNKW>)
+    {
+        return {0, 1, 2, 3};
+    }
+    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNCHW> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKCYX> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNKHW>)
+    {
+        return {0, 1, 2, 3, 4};
+    }
+    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNCDHW> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKCZYX> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNKDHW>)
+    {
+        return {0, 1, 2, 3, 4, 5};
+    }
+    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNWC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKXC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNWK>)
+    {
+        return {0, 1, 3, 2};
+    }
+    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNHWC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKYXC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNHWK>)
+    {
+        return {0, 1, 4, 2, 3};
+    }
+    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNDHWC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GKZYXC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::GNDHWK>)
+    {
+        return {0, 1, 5, 2, 3, 4};
+    }
+    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWGC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KXGC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NWGK>)
+    {
+        return {2, 0, 3, 1};
+    }
+    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWGC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KYXGC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NHWGK>)
+    {
+        return {3, 0, 4, 1, 2};
+    }
+    else if constexpr(ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWGC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::KZYXGC> ||
+                      ck::is_same_v<OldLayout, ck::tensor_layout::convolution::NDHWGK>)
+    {
+        return {4, 0, 5, 1, 2, 3};
+    }
+    else
+    {
+        printf("%s\n", __func__);
+        throw std::runtime_error("wrong! unsupported layout");
+    }
+}
+
+} // namespace detail
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.end()); // make tensor descriptor for packed input tensor, and order the dimension in the order of GNCHW
// regardless of physical layout
template <typename InLayout>
HostTensorDescriptor
make_input_host_tensor_descriptor_packed(const ck::utils::conv::ConvParam& param)
{
std::vector<std::size_t> physical_lengths;
return HostTensorDescriptor(nhwc_lengths); if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNCW> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNCHW> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNCDHW>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
} }
else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NCW> || else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNWC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NCHW> || ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNHWC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NCDHW>) ck::is_same_v<InLayout, ck::tensor_layout::convolution::GNDHWC>)
{ {
std::vector<std::size_t> nchw_lengths{static_cast<std::size_t>(param.N_), physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.C_)}; static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.C_)};
nchw_lengths.insert(nchw_lengths.end(), physical_lengths.insert(physical_lengths.begin() + 2,
param.input_spatial_lengths_.begin(), param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.end()); param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(ck::is_same_v<InLayout, ck::tensor_layout::convolution::NWGC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NHWGC> ||
ck::is_same_v<InLayout, ck::tensor_layout::convolution::NDHWGC>)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.C_)};
return HostTensorDescriptor(nchw_lengths); physical_lengths.insert(physical_lengths.begin() + 1,
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
} }
else else
{ {
printf("%s\n", __func__);
printf("%s\n", InLayout::name);
throw std::runtime_error("wrong! unsupported layout"); throw std::runtime_error("wrong! unsupported layout");
} }
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<InLayout>());
} }
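The helper first lays the lengths out in physical (memory) order, then relabels the dimensions into logical GNCHW order with a new2old permutation: for GNHWC the permutation {0, 1, 4, 2, 3} reads "logical dim 2 (C) is physical dim 4; logical dims 3 and 4 (H, W) are physical dims 2 and 3". A self-contained sketch of that relabeling on plain length vectors (transpose_host_tensor_descriptor_given_new2old applies the same permutation to lengths and strides together):

#include <cstddef>
#include <cstdio>
#include <vector>

// Apply a new2old permutation: out[new_dim] = in[new2old[new_dim]].
std::vector<std::size_t> permute_new2old(const std::vector<std::size_t>& in,
                                         const std::vector<std::size_t>& new2old)
{
    std::vector<std::size_t> out(in.size());
    for(std::size_t i = 0; i < in.size(); ++i)
        out[i] = in[new2old[i]];
    return out;
}

int main()
{
    // Packed GNHWC input: G=2, N=4, H=8, W=8, C=16 in memory order.
    std::vector<std::size_t> gnhwc_lengths{2, 4, 8, 8, 16};

    // GNCHW <- GNHWC permutation, as returned by
    // detail::get_layout_transpose_gnchw_to_old<GNHWC>().
    auto gnchw_lengths = permute_new2old(gnhwc_lengths, {0, 1, 4, 2, 3});

    for(auto len : gnchw_lengths)
        std::printf("%zu ", len); // prints: 2 4 16 8 8
    std::printf("\n");
    return 0;
}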
-template <typename WeiLayout>
-HostTensorDescriptor get_weight_host_tensor_descriptor(const ck::utils::conv::ConvParam& param)
-{
-    if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> ||
-                 ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> ||
-                 ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
-    {
-        std::vector<std::size_t> kyxc_lengths{static_cast<std::size_t>(param.K_),
-                                              static_cast<std::size_t>(param.C_)};
-
-        kyxc_lengths.insert(kyxc_lengths.begin() + 1,
-                            param.filter_spatial_lengths_.begin(),
-                            param.filter_spatial_lengths_.end());
-
-        return HostTensorDescriptor(kyxc_lengths);
-    }
-    else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KCX> ||
-                      ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KCYX> ||
-                      ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KCZYX>)
-    {
-        std::vector<std::size_t> kcyx_lengths{static_cast<std::size_t>(param.K_),
-                                              static_cast<std::size_t>(param.C_)};
-
-        kcyx_lengths.insert(kcyx_lengths.end(),
-                            param.filter_spatial_lengths_.begin(),
-                            param.filter_spatial_lengths_.end());
-
-        return HostTensorDescriptor(kcyx_lengths);
-    }
-    else
-    {
-        throw std::runtime_error("wrong! unsupported layout");
-    }
-}
+// make tensor descriptor for packed weight tensor, and order the dimension in the order of GKCYX
+// regardless of physical layout
+template <typename WeiLayout>
+HostTensorDescriptor
+make_weight_host_tensor_descriptor_packed(const ck::utils::conv::ConvParam& param)
+{
+    std::vector<std::size_t> physical_lengths;
+
+    if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKCX> ||
+                 ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKCYX> ||
+                 ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKCZYX>)
+    {
+        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
+                                                    static_cast<std::size_t>(param.K_),
+                                                    static_cast<std::size_t>(param.C_)};
+
+        physical_lengths.insert(physical_lengths.end(),
+                                param.filter_spatial_lengths_.begin(),
+                                param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
+    }
+    else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKXC> ||
+                      ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKYXC> ||
+                      ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::GKZYXC>)
+    {
+        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
+                                                    static_cast<std::size_t>(param.K_),
+                                                    static_cast<std::size_t>(param.C_)};
+
+        physical_lengths.insert(physical_lengths.begin() + 2,
+                                param.filter_spatial_lengths_.begin(),
+                                param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
+    }
+    else if constexpr(ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KXGC> ||
+                      ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXGC> ||
+                      ck::is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXGC>)
+    {
+        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
+                                                    static_cast<std::size_t>(param.G_),
+                                                    static_cast<std::size_t>(param.C_)};
+
+        physical_lengths.insert(physical_lengths.begin() + 1,
+                                param.filter_spatial_lengths_.begin(),
+                                param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
+    }
+    else
+    {
+        printf("%s\n", __func__);
+        throw std::runtime_error("wrong! unsupported layout");
+    }
+
+    return transpose_host_tensor_descriptor_given_new2old(
+        HostTensorDescriptor(physical_lengths),
+        detail::get_layout_transpose_gnchw_to_old<WeiLayout>());
+}
-template <typename OutLayout>
-HostTensorDescriptor get_output_host_tensor_descriptor(const ck::utils::conv::ConvParam& param)
-{
-    if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NWK> ||
-                 ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK> ||
-                 ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
-    {
-        std::vector<std::size_t> nhwk_lengths{static_cast<std::size_t>(param.N_),
-                                              static_cast<std::size_t>(param.K_)};
-
-        nhwk_lengths.insert(nhwk_lengths.begin() + 1,
-                            param.output_spatial_lengths_.begin(),
-                            param.output_spatial_lengths_.end());
-
-        return HostTensorDescriptor(nhwk_lengths);
-    }
-    else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NKW> ||
-                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NKHW> ||
-                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NKDHW>)
-    {
-        std::vector<std::size_t> nkhw_lengths{static_cast<std::size_t>(param.N_),
-                                              static_cast<std::size_t>(param.K_)};
-
-        nkhw_lengths.insert(nkhw_lengths.end(),
-                            param.output_spatial_lengths_.begin(),
-                            param.output_spatial_lengths_.end());
-
-        return HostTensorDescriptor(nkhw_lengths);
-    }
-    else
-    {
-        throw std::runtime_error("wrong! unsupported layout");
-    }
-}
+// make tensor descriptor for packed output tensor, and order the dimension in the order of GNKHW
+// regardless of physical layout
+template <typename OutLayout>
+HostTensorDescriptor
+make_output_host_tensor_descriptor_packed(const ck::utils::conv::ConvParam& param)
+{
+    std::vector<std::size_t> physical_lengths;
+
+    if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNKW> ||
+                 ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNKHW> ||
+                 ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNKDHW>)
+    {
+        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
+                                                    static_cast<std::size_t>(param.N_),
+                                                    static_cast<std::size_t>(param.K_)};
+
+        physical_lengths.insert(physical_lengths.end(),
+                                param.output_spatial_lengths_.begin(),
+                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
+    }
+    else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNWK> ||
+                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNHWK> ||
+                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::GNDHWK>)
+    {
+        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
+                                                    static_cast<std::size_t>(param.N_),
+                                                    static_cast<std::size_t>(param.K_)};
+
+        physical_lengths.insert(physical_lengths.begin() + 2,
+                                param.output_spatial_lengths_.begin(),
+                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
+    }
+    else if constexpr(ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NWGK> ||
+                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NHWGK> ||
+                      ck::is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWGK>)
+    {
+        physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
+                                                    static_cast<std::size_t>(param.G_),
+                                                    static_cast<std::size_t>(param.K_)};
+
+        physical_lengths.insert(physical_lengths.begin() + 1,
+                                param.output_spatial_lengths_.begin(),
+                                param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
+    }
+    else
+    {
+        printf("%s\n", __func__);
+        throw std::runtime_error("wrong! unsupported layout");
+    }
+
+    return transpose_host_tensor_descriptor_given_new2old(
+        HostTensorDescriptor(physical_lengths),
+        detail::get_layout_transpose_gnchw_to_old<OutLayout>());
+}
 } // namespace conv
...
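Together the three helpers give host code a single logical indexing convention, (g, n, c, hi, wi) / (g, k, c, y, x) / (g, n, k, ho, wo), over any supported physical layout. A hedged usage sketch, assuming a 2D ConvParam as in the profiler example above (the layout choices here are illustrative):

// Hedged sketch: Tensor and HostTensorDescriptor come from CK's host utilities.
namespace conv_layout = ck::tensor_layout::convolution;

void make_grouped_conv_tensors(const ck::utils::conv::ConvParam& conv_param)
{
    const auto in_desc =
        ck::utils::conv::make_input_host_tensor_descriptor_packed<conv_layout::NHWGC>(conv_param);
    const auto wei_desc =
        ck::utils::conv::make_weight_host_tensor_descriptor_packed<conv_layout::KYXGC>(conv_param);
    const auto out_desc =
        ck::utils::conv::make_output_host_tensor_descriptor_packed<conv_layout::NHWGK>(conv_param);

    Tensor<float> in(in_desc);   // indexed as in(g, n, c, hi, wi)
    Tensor<float> wei(wei_desc); // indexed as wei(g, k, c, y, x)
    Tensor<float> out(out_desc); // indexed as out(g, n, k, ho, wo)
}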
@@ -358,6 +358,19 @@ struct Tensor
                                        mDesc.GetLengths()[4])(num_thread);
             break;
         }
+        case 6: {
+            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
+                (*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
+            };
+            make_ParallelTensorFunctor(f,
+                                       mDesc.GetLengths()[0],
+                                       mDesc.GetLengths()[1],
+                                       mDesc.GetLengths()[2],
+                                       mDesc.GetLengths()[3],
+                                       mDesc.GetLengths()[4],
+                                       mDesc.GetLengths()[5])(num_thread);
+            break;
+        }
         default: throw std::runtime_error("unsupported dimension");
     }
 }
...
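With the rank-6 case added, generator-driven fills now cover grouped tensors whose descriptors carry six dimensions. A hedged sketch of driving it, assuming this switch is the dispatch inside Tensor::GenerateTensorValue (as its generator-style signature suggests); the functor arity must match the tensor rank:

// Hedged sketch: fill a rank-6 grouped tensor in parallel.
#include <cstddef>
#include <vector>

void fill_rank6_example()
{
    // Tensor and HostTensorDescriptor come from CK's host utilities.
    Tensor<float> tensor(HostTensorDescriptor(std::vector<std::size_t>{2, 3, 4, 5, 6, 7}));

    // The generator receives one index per dimension, matching case 6 above.
    tensor.GenerateTensorValue(
        [](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
            return static_cast<float>(i0 + i1 + i2 + i3 + i4 + i5);
        },
        4 /*num_thread*/);
}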