"vscode:/vscode.git/clone" did not exist on "58a8057011f7fb4e477b7103b8babdd69cfe7f01"
Commit 68fda7c2 authored by Bartlomiej Kocot

Add support for groups in Img2Col/Col2Img

parent bec84efb
@@ -19,7 +19,7 @@ using OutDataType = ck::half_t;
 using ImageLayout = ck::tensor_layout::convolution::GNHWC;
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
+static constexpr ck::index_t G = 2;
 static constexpr ck::index_t N = 32; // batch size
 static constexpr ck::index_t C = 32; // input channel (per group)
 static constexpr ck::index_t Y = 3; // filter H
@@ -52,7 +52,7 @@ int main()
     std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
     std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
-    // We have NHWGC in memory space (G is dummy)
+    // We have NHWGC in memory space
     // However, CK's API only accept length and stride with order of GNCHW
     // Hence, we need to adjust the order of stride
     std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
@@ -63,7 +63,7 @@ int main()
     std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
     std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
-    SimpleDeviceMem in(sizeof(InDataType) * N * Ho * Wo * Y * X * C);
+    SimpleDeviceMem in(sizeof(InDataType) * G * N * Ho * Wo * Y * X * C);
     SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C);
     using namespace ck::conv_tensor_rearrange_op;
@@ -93,6 +93,7 @@ int main()
         auto& op_ptr = op_ptrs[i];
         auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                         out.GetDeviceBuffer(),
+                                                        G,
                                                         N,
                                                         C,
                                                         in_spatial_lengths,
@@ -112,7 +113,7 @@ int main()
         float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
         std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
-                                sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
+                                sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
         float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -149,6 +150,7 @@ int main()
                   << std::endl;
         auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                         out.GetDeviceBuffer(),
+                                                        G,
                                                         N,
                                                         C,
                                                         in_spatial_lengths,
...
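The comment in the hunk above notes that the buffer is NHWGC in memory while CK expects lengths and strides in GNCHW order. A minimal standalone sketch of that stride permutation, assuming illustrative Hi/Wi values that are not from the commit:

    #include <array>
    #include <cstdint>
    #include <iostream>

    int main()
    {
        // Illustrative sizes only: G, N, C mirror the example above; Hi/Wi are assumptions.
        const std::int64_t G = 2, N = 32, C = 32, Hi = 28, Wi = 28;

        // Physical layout is NHWGC (C fastest), but CK wants strides listed in
        // GNCHW order, so the NHWGC strides are permuted into {G, N, C, H, W}:
        const std::array<std::int64_t, 5> image_strides{
            C,               // G: the next group starts C elements away
            Hi * Wi * G * C, // N: one whole image per batch entry
            1,               // C: channels are contiguous
            Wi * G * C,      // H: one image row is Wi pixels of G * C channels
            G * C            // W: one pixel holds G * C channels
        };

        for(const auto s : image_strides)
            std::cout << s << ' ';
        std::cout << '\n';
        return 0;
    }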
@@ -19,7 +19,7 @@ using OutDataType = ck::half_t;
 using ImageLayout = ck::tensor_layout::convolution::GNHWC;
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
+static constexpr ck::index_t G = 2;
 static constexpr ck::index_t N = 32; // batch size
 static constexpr ck::index_t C = 32; // input channel (per group)
 static constexpr ck::index_t Y = 3; // filter H
@@ -64,7 +64,7 @@ int main()
     std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
     SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C);
     using namespace ck::conv_tensor_rearrange_op;
@@ -93,6 +93,7 @@ int main()
         auto& op_ptr = op_ptrs[i];
         auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                         out.GetDeviceBuffer(),
+                                                        G,
                                                         N,
                                                         C,
                                                         in_spatial_lengths,
@@ -112,7 +113,7 @@ int main()
         float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
         std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
-                                sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
+                                sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
         float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -149,6 +150,7 @@ int main()
                   << std::endl;
         auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                         out.GetDeviceBuffer(),
+                                                        G,
                                                         N,
                                                         C,
                                                         in_spatial_lengths,
...
@@ -20,18 +20,19 @@ using DeviceColToImgInstance = ck::tensor_operation::device::DeviceColumnToImage
 bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
 {
+    const auto G = conv_params.G_;
     const auto N = conv_params.N_;
     const auto C = conv_params.C_;
-    const ck::index_t NDoHoWo =
-        N * ck::accumulate_n<ck::index_t>(
-            conv_params.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+    const ck::index_t GNDoHoWo =
+        G * N *
+        ck::accumulate_n<ck::index_t>(
+            conv_params.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
     const ck::index_t CZYX =
         C * ck::accumulate_n<ck::index_t>(
             conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
-    const auto in_desc = HostTensorDescriptor({NDoHoWo, CZYX});
+    const auto in_desc = HostTensorDescriptor({GNDoHoWo, CZYX});
     const auto out_desc =
         ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
@@ -86,6 +87,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
     auto invoker  = col2img.MakeInvoker();
     auto argument = col2img.MakeArgument(in_device_buf.GetDeviceBuffer(),
                                          out_device_buf.GetDeviceBuffer(),
+                                         G,
                                          N,
                                          C,
                                          input_spatial_lengths,
@@ -108,7 +110,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
     }
     float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
-    std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
+    std::size_t num_btype = GNDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
     float gb_per_sec = num_btype / 1.E6 / ave_time;
     std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
...
@@ -20,20 +20,21 @@ using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumn
 bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
 {
+    const auto G = conv_params.G_;
     const auto N = conv_params.N_;
     const auto C = conv_params.C_;
-    const ck::index_t NDoHoWo =
-        N * ck::accumulate_n<ck::index_t>(
-            conv_params.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+    const ck::index_t GNDoHoWo =
+        G * N *
+        ck::accumulate_n<ck::index_t>(
+            conv_params.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
     const ck::index_t CZYX =
         C * ck::accumulate_n<ck::index_t>(
             conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
     const auto in_desc =
         ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
-    const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
+    const auto out_desc = HostTensorDescriptor({GNDoHoWo, CZYX});
     std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
     std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
@@ -86,6 +87,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
     auto invoker  = img2col.MakeInvoker();
     auto argument = img2col.MakeArgument(in_device_buf.GetDeviceBuffer(),
                                          out_device_buf.GetDeviceBuffer(),
+                                         G,
                                          N,
                                          C,
                                          input_spatial_lengths,
@@ -108,7 +110,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
     }
     float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
-    std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
+    std::size_t num_btype = GNDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
     float gb_per_sec = num_btype / 1.E6 / ave_time;
     std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
...
@@ -14,11 +14,12 @@ namespace device {
 /**
  * \brief Convolution Tensor Rearrange.
  *
- * This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to
- * the gemm problem([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and
- * conversion gemm form to the image (Column to Image).
- *
- * Note that G must be equal to 1.
+ * This Device operator supports conversion image to
+ * the gemm problem (Image to Column) and
+ * conversion gemm form to the image (Column to Image).
+ * Supported layouts:
+ * [G, N, Di, Hi, Wi, C] <-> [G * N * Do * Ho * Wo, Z * Y * X * C]
+ * [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo * G, Z * Y * X * C]
  *
  * \tparam NDimSpatial Number of spatial dimensions.
  * \tparam ImageLayout Input Layout.
@@ -39,6 +40,7 @@ struct DeviceConvTensorRearrange : public BaseOperator
  *
  * \param p_in A pointer to the device memory of the input image.
  * \param p_out A pointer to the device memory of the output.
+ * \param G Convolution number of groups.
  * \param N Convolution batch size.
  * \param C Convolution number of channels.
  * \param input_spatial_lengths Input spatial lengths.
@@ -55,6 +57,7 @@ struct DeviceConvTensorRearrange : public BaseOperator
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_in,
                         void* p_out,
+                        const ck::index_t G,
                         const ck::index_t N,
                         const ck::index_t C,
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
...
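As a concrete reading of the two layout mappings now documented above, a hedged shape calculation; all sizes here are illustrative, not from the commit:

    #include <cstdint>
    #include <iostream>

    int main()
    {
        // Illustrative 2D sizes: G groups, batch N, C channels per group,
        // Y x X filter, Ho x Wo output extents.
        const std::int64_t G = 2, N = 32, C = 32, Y = 3, X = 3, Ho = 28, Wo = 28;

        // Image [G, N, Hi, Wi, C]  <->  gemm matrix [G * N * Ho * Wo, Y * X * C]:
        const std::int64_t M = G * N * Ho * Wo; // one row per (g, n, ho, wo) output pixel
        const std::int64_t K = Y * X * C;       // one column per (y, x, c) filter tap

        std::cout << "gemm problem: " << M << " x " << K << '\n'; // 50176 x 288
        return 0;
    }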
@@ -17,15 +17,18 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/io.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-// Image to column for input layout NDHWC:
-// input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C]
-// output : image [N, Di, Hi, Wi, C]
+// Column to Image:
+// input : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C]
+// output : input image [G, N, Di, Hi, Wi, C]
+// input : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C]
+// output : input image [N, Di, Hi, Wi, G, C]
 template <index_t NDimSpatial,
           typename ImageLayout,
           typename InputDataType,
@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl
                                     OutputDataType,
                                     conv_tensor_rearrange_op::ColumnToImage>
 {
+    static constexpr bool is_NSpatialGC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
+    static constexpr bool is_GNSpatialC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -85,7 +96,8 @@ struct DeviceColumnToImageImpl
     // Make column form descriptor
     static auto
-    MakeInputDescriptor_M_K(const ck::index_t N,
+    MakeInputDescriptor_M_K(const ck::index_t G,
+                            const ck::index_t N,
                             const ck::index_t C,
                             const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                             const std::array<index_t, NDimSpatial>& output_spatial_lengths,
@@ -100,16 +112,19 @@ struct DeviceColumnToImageImpl
             C * ck::accumulate_n<index_t>(
                 filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
-        const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+        const index_t AdditionalGroupStride = is_NSpatialGC ? G : 1;
+        const index_t NStride =
+            DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1] * AdditionalGroupStride;
         // Calculate the appropriate stride for each set of independent filters
         // in each dimension
-        const index_t WStride =
-            math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0];
+        const index_t WStride = math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) *
+                                gemm_m_k_strides[I0] * AdditionalGroupStride;
         const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
-                                output_spatial_lengths[XIdx] * gemm_m_k_strides[I0];
+                                output_spatial_lengths[XIdx] * gemm_m_k_strides[I0] *
+                                AdditionalGroupStride;
         const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
                                 output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
-                                gemm_m_k_strides[I0];
+                                gemm_m_k_strides[I0] * AdditionalGroupStride;
         // Create descriptor for independent filters in each dimension and
         // then merge them into column form
         if constexpr(NDimSpatial == 1)
@@ -244,7 +259,7 @@ struct DeviceColumnToImageImpl
     }
     using InputGridDesc =
-        remove_cvref_t<decltype(MakeInputDescriptor_M_K(1, 1, {}, {}, {}, {}, {}, {}))>;
+        remove_cvref_t<decltype(MakeInputDescriptor_M_K(1, 1, 1, {}, {}, {}, {}, {}, {}))>;
     using OutputGridDesc = remove_cvref_t<decltype(MakeOutDescriptor_M_K(
         1, 1, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>;
@@ -252,22 +267,25 @@ struct DeviceColumnToImageImpl
         decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
             InputGridDesc{}))>;
-    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
-                                                                  InputDataType,
-                                                                  OutputGridDesc,
-                                                                  OutputDataType,
-                                                                  BlockSize,
-                                                                  MPerBlock,
-                                                                  KPerBlock,
-                                                                  ThreadClusterLengths,
-                                                                  ScalarPerVector,
-                                                                  InMemoryDataOperationEnum::Add,
-                                                                  Block2ETileMap>;
+    using GridwiseTensorRearrangeKernel =
+        GridwiseTensorRearrange<InputGridDesc,
+                                InputDataType,
+                                OutputGridDesc,
+                                OutputDataType,
+                                BlockSize,
+                                MPerBlock,
+                                KPerBlock,
+                                ThreadClusterLengths,
+                                ScalarPerVector,
+                                InMemoryDataOperationEnum::Add,
+                                Block2ETileMap,
+                                ComputePtrOffsetOfStridedBatch<I0>>;
     struct Argument : public BaseArgument
     {
         Argument(const void* p_in, // input image
                  void* p_out,      // output image
+                 const ck::index_t G,
                  const ck::index_t N,
                  const ck::index_t C,
                  const std::array<index_t, NDimSpatial>& input_spatial_lengths,
@@ -279,7 +297,8 @@ struct DeviceColumnToImageImpl
                  const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                  const std::array<index_t, NDimSpatial>& input_left_pads,
                  const std::array<index_t, NDimSpatial>& input_right_pads)
-            : C_(C),
+            : G_(G),
+              C_(C),
               X_(filter_spatial_lengths[NDimSpatial - I1]),
               p_in_{static_cast<const InputDataType*>(p_in)},
               p_out_{static_cast<OutputDataType*>(p_out)},
@@ -289,6 +308,23 @@ struct DeviceColumnToImageImpl
               input_left_pads_{input_left_pads},
               input_right_pads_{input_right_pads}
         {
+            using namespace tensor_layout::convolution;
+            if constexpr(is_NSpatialGC)
+            {
+                compute_ptr_offset_of_batch_.BatchStrideA_ =
+                    gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+            }
+            else if constexpr(is_GNSpatialC)
+            {
+                const index_t NDoHoWo =
+                    N * ck::accumulate_n<index_t>(
+                            output_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
+                compute_ptr_offset_of_batch_.BatchStrideA_ =
+                    NDoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+            }
+            compute_ptr_offset_of_batch_.BatchStrideC_ = image_g_n_c_wis_strides[I0];
             const index_t x_eff =
                 (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1;
             const index_t y_eff =
@@ -349,7 +385,8 @@ struct DeviceColumnToImageImpl
                     continue;
                 const auto in_grid_desc_m_k =
-                    MakeInputDescriptor_M_K(N,
+                    MakeInputDescriptor_M_K(G,
+                                            N,
                                             C,
                                             filter_spatial_lengths,
                                             output_spatial_lengths,
@@ -384,13 +421,13 @@ struct DeviceColumnToImageImpl
                 const index_t z_offset_with_pad =
                     math::max(0, z_img_offset - input_left_pads[ZIdx]);
+                const index_t AdditionalGroupStride = is_NSpatialGC ? G : 1;
                 // Memory offsets to next set of independent filters,
                 // move to independent filters in each dimension
                 const index_t in_offset =
-                    x_idx * gemm_m_k_strides[0] +
-                    y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] +
-                    z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] *
-                        output_spatial_lengths[XIdx];
+                    (x_idx + y_idx * output_spatial_lengths[XIdx] +
+                     z_idx * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx]) *
+                    gemm_m_k_strides[0] * AdditionalGroupStride;
                 // Move to independent filters in appropriate dimensions
                 const index_t out_offset =
                     x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] +
@@ -417,6 +454,7 @@ struct DeviceColumnToImageImpl
             }
         }
+        const ck::index_t G_;
         const ck::index_t C_;
         const ck::index_t X_;
@@ -434,6 +472,8 @@ struct DeviceColumnToImageImpl
         std::vector<const InputDataType*> p_in_container_;
         std::vector<OutputDataType*> p_out_container_;
+
+        ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
     };
     struct Invoker : public BaseInvoker
@@ -451,6 +491,7 @@ struct DeviceColumnToImageImpl
                                                  OutputGridDesc,
                                                  OutputDataType,
                                                  Block2ETileMap,
+                                                 ComputePtrOffsetOfStridedBatch<I0>,
                                                  GridwiseTensorRearrangeKernel>;
             // Execute each set of independent filters
@@ -470,7 +511,9 @@ struct DeviceColumnToImageImpl
                                                   arg.p_in_container_[i],
                                                   arg.out_grid_desc_m_k_container_[i],
                                                   arg.p_out_container_[i],
-                                                  block_2_tile_map);
+                                                  arg.G_,
+                                                  block_2_tile_map,
+                                                  arg.compute_ptr_offset_of_batch_);
             }
             return elapsed_time;
         }
@@ -485,8 +528,7 @@ struct DeviceColumnToImageImpl
     bool IsSupportedArgument(const Argument& arg)
     {
         using namespace tensor_layout::convolution;
-        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
-                       std::is_same_v<ImageLayout, GNDHWC>))
+        if constexpr(!(is_NSpatialGC || is_GNSpatialC))
         {
             return false;
         }
@@ -534,6 +576,7 @@ struct DeviceColumnToImageImpl
     static auto MakeArgument(const void* p_in, // input image
                              void* p_out,      // output image
+                             const ck::index_t G,
                              const ck::index_t N,
                              const ck::index_t C,
                              const std::array<index_t, NDimSpatial>& input_spatial_lengths,
@@ -548,6 +591,7 @@ struct DeviceColumnToImageImpl
     {
         return Argument{static_cast<const InputDataType*>(p_in),
                         static_cast<OutputDataType*>(p_out),
+                        G,
                         N,
                         C,
                         input_spatial_lengths,
@@ -566,6 +610,7 @@ struct DeviceColumnToImageImpl
     std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_in, // input image
                         void* p_out,      // output image
+                        const ck::index_t G,
                         const ck::index_t N,
                         const ck::index_t C,
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
@@ -580,6 +625,7 @@ struct DeviceColumnToImageImpl
     {
         return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
                                           static_cast<OutputDataType*>(p_out),
+                                          G,
                                           N,
                                           C,
                                           input_spatial_lengths,
...
@@ -15,15 +15,18 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/io.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-// Image to column for input layout NDHWC:
-// input : input image [N, Di, Hi, Wi, C]
-// output : gemm form [N * Do * Ho * Wo, Z * Y * X * C]
+// Image to column:
+// input : input image [G, N, Di, Hi, Wi, C]
+// output : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C]
+// input : input image [N, Di, Hi, Wi, G, C]
+// output : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C]
 template <index_t NDimSpatial,
           typename ImageLayout,
           typename InputDataType,
@@ -41,6 +44,14 @@ struct DeviceImageToColumnImpl
                                     OutputDataType,
                                     conv_tensor_rearrange_op::ImageToColumn>
 {
+    static constexpr bool is_NSpatialGC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
+    static constexpr bool is_GNSpatialC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -105,7 +116,8 @@ struct DeviceImageToColumnImpl
     }
     static auto
-    MakeOutDescriptor_M_K(const ck::index_t N,
+    MakeOutDescriptor_M_K(const ck::index_t G,
+                          const ck::index_t N,
                           const ck::index_t C,
                           const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                           const std::array<index_t, NDimSpatial>& output_spatial_lengths,
@@ -117,37 +129,49 @@ struct DeviceImageToColumnImpl
         const index_t CZYX =
             C * ck::accumulate_n<index_t>(
                 filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
-        const auto desc_mraw_kraw = make_naive_tensor_descriptor(
-            make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1]));
-        const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
-        return desc_m_k;
+        if constexpr(is_NSpatialGC)
+        {
+            const auto desc_mraw_kraw = make_naive_tensor_descriptor(
+                make_tuple(NDoHoWo, CZYX),
+                make_tuple(gemm_m_k_strides[I0] * G, gemm_m_k_strides[I1]));
+            return matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
+        }
+        else if constexpr(is_GNSpatialC)
+        {
+            const auto desc_mraw_kraw = make_naive_tensor_descriptor(
+                make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1]));
+            return matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
+        }
     }
     using InputGridDesc =
         remove_cvref_t<decltype(MakeInputDescriptor_M_K(1, 1, {}, {}, {}, {}, {}, {}, {}, {}))>;
-    using OutputGridDesc = remove_cvref_t<decltype(MakeOutDescriptor_M_K(1, 1, {}, {}, {}))>;
+    using OutputGridDesc = remove_cvref_t<decltype(MakeOutDescriptor_M_K(1, 1, 1, {}, {}, {}))>;
     using Block2ETileMap = remove_cvref_t<
         decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
             OutputGridDesc{}))>;
-    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
-                                                                  InputDataType,
-                                                                  OutputGridDesc,
-                                                                  OutputDataType,
-                                                                  BlockSize,
-                                                                  MPerBlock,
-                                                                  KPerBlock,
-                                                                  ThreadClusterLengths,
-                                                                  ScalarPerVector,
-                                                                  InMemoryDataOperationEnum::Set,
-                                                                  Block2ETileMap>;
+    using GridwiseTensorRearrangeKernel =
+        GridwiseTensorRearrange<InputGridDesc,
+                                InputDataType,
+                                OutputGridDesc,
+                                OutputDataType,
+                                BlockSize,
+                                MPerBlock,
+                                KPerBlock,
+                                ThreadClusterLengths,
+                                ScalarPerVector,
+                                InMemoryDataOperationEnum::Set,
+                                Block2ETileMap,
+                                ComputePtrOffsetOfStridedBatch<I0>>;
     struct Argument : public BaseArgument
     {
         Argument(const void* p_in, // input image
                  void* p_out,      // gemm form
+                 const ck::index_t G,
                  const ck::index_t N,
                  const ck::index_t C,
                  const std::array<index_t, NDimSpatial>& input_spatial_lengths,
@@ -159,7 +183,8 @@ struct DeviceImageToColumnImpl
                  const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                  const std::array<index_t, NDimSpatial>& input_left_pads,
                  const std::array<index_t, NDimSpatial>& input_right_pads)
-            : C_(C),
+            : G_(G),
+              C_(C),
               X_(filter_spatial_lengths[NDimSpatial - I1]),
               p_in_{static_cast<const InputDataType*>(p_in)},
               p_out_{static_cast<OutputDataType*>(p_out)},
@@ -176,14 +201,28 @@ struct DeviceImageToColumnImpl
                                                     filter_spatial_lengths,
                                                     output_spatial_lengths,
                                                     image_g_n_c_wis_strides,
                                                     conv_filter_strides,
                                                     conv_filter_dilations,
                                                     input_left_pads,
                                                     input_right_pads);
             out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
-                N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides);
+                G, N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides);
+            compute_ptr_offset_of_batch_.BatchStrideA_ = image_g_n_c_wis_strides[I0];
+            if constexpr(is_NSpatialGC)
+            {
+                compute_ptr_offset_of_batch_.BatchStrideC_ =
+                    gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+            }
+            else if constexpr(is_GNSpatialC)
+            {
+                const index_t NDoHoWo =
+                    N * ck::accumulate_n<index_t>(
+                            output_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
+                compute_ptr_offset_of_batch_.BatchStrideC_ =
+                    NDoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+            }
         }
         void Print() const
@@ -192,6 +231,7 @@ struct DeviceImageToColumnImpl
             std::cout << out_grid_desc_m_k_ << std::endl;
         }
+        const ck::index_t G_;
         const ck::index_t C_;
         const ck::index_t X_;
@@ -206,6 +246,8 @@ struct DeviceImageToColumnImpl
         InputGridDesc in_grid_desc_m_k_;
         OutputGridDesc out_grid_desc_m_k_;
+
+        ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
     };
     struct Invoker : public BaseInvoker
@@ -226,6 +268,7 @@ struct DeviceImageToColumnImpl
                                                  OutputGridDesc,
                                                  OutputDataType,
                                                  Block2ETileMap,
+                                                 ComputePtrOffsetOfStridedBatch<I0>,
                                                  GridwiseTensorRearrangeKernel>;
             float elapsed_time = launch_and_time_kernel(stream_config,
@@ -237,7 +280,9 @@ struct DeviceImageToColumnImpl
                                                         arg.p_in_,
                                                         arg.out_grid_desc_m_k_,
                                                         arg.p_out_,
-                                                        block_2_tile_map);
+                                                        arg.G_,
+                                                        block_2_tile_map,
+                                                        arg.compute_ptr_offset_of_batch_);
             return elapsed_time;
         }
@@ -250,9 +295,7 @@ struct DeviceImageToColumnImpl
     bool IsSupportedArgument(const Argument& arg)
     {
-        using namespace tensor_layout::convolution;
-        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
-                       std::is_same_v<ImageLayout, GNDHWC>))
+        if constexpr(!(is_NSpatialGC || is_GNSpatialC))
         {
             return false;
         }
@@ -295,6 +338,7 @@ struct DeviceImageToColumnImpl
     static auto MakeArgument(const void* p_in, // input image
                              void* p_out,      // gemm form
+                             const ck::index_t G,
                              const ck::index_t N,
                              const ck::index_t C,
                              const std::array<index_t, NDimSpatial>& input_spatial_lengths,
@@ -309,6 +353,7 @@ struct DeviceImageToColumnImpl
     {
         return Argument{static_cast<const InputDataType*>(p_in),
                         static_cast<OutputDataType*>(p_out),
+                        G,
                         N,
                         C,
                         input_spatial_lengths,
@@ -327,6 +372,7 @@ struct DeviceImageToColumnImpl
     std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_in, // input image
                         void* p_out,      // gemm form
+                        const ck::index_t G,
                         const ck::index_t N,
                         const ck::index_t C,
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
@@ -341,6 +387,7 @@ struct DeviceImageToColumnImpl
     {
         return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
                                           static_cast<OutputDataType*>(p_out),
+                                          G,
                                           N,
                                           C,
                                           input_spatial_lengths,
...
@@ -21,6 +21,7 @@ template <typename InputGridDesc,
           typename OutputGridDesc,
           typename OutputDataType,
           typename Block2ETileMap,
+          typename ComputePtrOffsetOfStridedBatch,
           typename GridwiseTensorRearrangeKernel>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -30,13 +31,20 @@ __global__ void
         const InputDataType* __restrict__ p_in_global,
         const OutputGridDesc out_grid_desc,
         OutputDataType* __restrict__ p_out_global,
-        const Block2ETileMap block_2_tile_map)
+        const index_t batch_count,
+        const Block2ETileMap block_2_tile_map,
+        const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
     defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
     defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
-    GridwiseTensorRearrangeKernel::Run(
-        in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map);
+    GridwiseTensorRearrangeKernel::Run(in_grid_desc,
+                                       p_in_global,
+                                       out_grid_desc,
+                                       p_out_global,
+                                       batch_count,
+                                       block_2_tile_map,
+                                       compute_ptr_offset_of_batch);
 #else
     ignore = in_grid_desc;
     ignore = p_in_global;
@@ -56,7 +64,8 @@ template <typename InputGridDesc,
           typename ThreadClusterLengths,
           index_t ScalarPerVector,
           InMemoryDataOperationEnum DstInMemOp,
-          typename Block2ETileMap>
+          typename Block2ETileMap,
+          typename ComputePtrOffsetOfStridedBatch>
 struct GridwiseTensorRearrange
 {
@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange
                     const InputDataType* __restrict__ p_in_global,
                     const OutputGridDesc& out_grid_desc,
                     OutputDataType* __restrict__ p_out_global,
-                    const Block2ETileMap& block_2_tile_map)
+                    const index_t batch_count,
+                    const Block2ETileMap& block_2_tile_map,
+                    const ComputePtrOffsetOfStridedBatch& compute_ptr_offset_of_batch)
     {
         const auto block_work_idx =
             block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange
         const index_t k_block_data_idx_on_grid =
             __builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock);
-        // Global Memory
-        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_in_global, in_grid_desc.GetElementSpaceSize());
-        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_out_global, out_grid_desc.GetElementSpaceSize());
         auto copy_global_to_global =
             ThreadGroupTensorSliceTransfer_v7<ThisThreadBlock,
                                               Tuple<InputDataType>,
@@ -108,8 +113,22 @@ struct GridwiseTensorRearrange
             make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
             tensor_operation::element_wise::PassThrough{}};
-        copy_global_to_global.Run(
-            tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
+        for(index_t idx = 0; idx < batch_count; idx++)
+        {
+            // Global Memory
+            const index_t a_batch_offset =
+                __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(idx));
+            const index_t c_batch_offset =
+                __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(idx));
+            const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
+            auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize());
+            copy_global_to_global.Run(
+                tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
+        }
     }
     __host__ static constexpr bool CheckValidity(const InputGridDesc& in_grid_desc,
...
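The kernel hunk above replaces the single block copy with a loop over batch_count that shifts the input and output base pointers by per-batch offsets. A hypothetical host-side analogue of the same pattern, with dense batch strides and a plain element copy standing in for the tiled tensor-slice transfer:

    #include <cstddef>
    #include <vector>

    // Host-side analogue (names hypothetical) of the loop added to
    // GridwiseTensorRearrange::Run: the descriptors stay fixed, only the
    // base pointers move by a per-batch stride (BatchStrideA_/BatchStrideC_).
    void rearrange_batched(const float* p_in,
                           float* p_out,
                           std::size_t batch_count,
                           std::size_t in_batch_stride,
                           std::size_t out_batch_stride,
                           std::size_t elements_per_batch)
    {
        for(std::size_t g = 0; g < batch_count; ++g)
        {
            const float* in = p_in + g * in_batch_stride;
            float* out      = p_out + g * out_batch_stride;
            for(std::size_t i = 0; i < elements_per_batch; ++i)
                out[i] = in[i]; // the real kernel runs a tiled tensor-slice transfer here
        }
    }

    int main()
    {
        // Two batches of four elements, densely packed.
        std::vector<float> in{0, 1, 2, 3, 4, 5, 6, 7}, out(8, 0.f);
        rearrange_batched(in.data(), out.data(), 2, 4, 4, 4);
        return 0;
    }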
@@ -19,9 +19,7 @@ namespace host {
  * \brief Reference implementation for column to image.
  *
  * Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
- * Memory layout is the same.
  * Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
- * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
  *
  * \tparam NDimSpatial Number of spatial dimensions.
  * \tparam ImageLayout Image Layout.
@@ -94,23 +92,33 @@ struct ReferenceColumnToImage : public device::BaseOperator
         float Run(const Argument& arg)
         {
+            using namespace ck::tensor_layout::convolution;
             if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 &&
                  arg.input_.GetNumOfDimension() == 2))
             {
                 throw std::runtime_error("wrong! inconsistent dimension");
             }
+            const index_t G = arg.output_.GetLengths()[0];
             const index_t N = arg.output_.GetLengths()[1];
             const index_t C = arg.output_.GetLengths()[2];
             if constexpr(NDimSpatial == 1)
             {
                 const index_t Wo = arg.output_spatial_lengths_[0];
-                auto func = [&](auto n) {
+                auto func = [&](auto g, auto n) {
                     for(index_t wo = 0; wo < Wo; ++wo)
                     {
                         index_t row    = n * Wo + wo;
                         index_t column = 0;
+                        if constexpr(std::is_same_v<ImageLayout, GNWC>)
+                        {
+                            row = g * N * Wo + n * Wo + wo;
+                        }
+                        else if constexpr(std::is_same_v<ImageLayout, NWGC>)
+                        {
+                            row = n * Wo * G + wo * G + g;
+                        }
                         for(index_t x = 0; x < arg.filter_spatial_lengths_[0]; ++x)
                         {
@@ -124,8 +132,8 @@ struct ReferenceColumnToImage : public device::BaseOperator
                                    ck::type_convert<std::size_t>(wi) < arg.output_.GetLengths()[3])
                                 {
                                     float v_in  = ck::type_convert<float>(arg.input_(row, column));
-                                    float v_out = ck::type_convert<float>(arg.output_(0, n, c, wi));
-                                    arg.output_(0, n, c, wi) =
+                                    float v_out = ck::type_convert<float>(arg.output_(g, n, c, wi));
+                                    arg.output_(g, n, c, wi) =
                                         ck::type_convert<OutDataType>(v_in + v_out);
                                 }
                                 column++;
@@ -134,7 +142,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                     }
                 };
-                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
                 return 0;
             }
@@ -143,13 +151,21 @@ struct ReferenceColumnToImage : public device::BaseOperator
                 const index_t Ho = arg.output_spatial_lengths_[0];
                 const index_t Wo = arg.output_spatial_lengths_[1];
-                auto func = [&](auto n) {
+                auto func = [&](auto g, auto n) {
                     for(index_t ho = 0; ho < Ho; ++ho)
                     {
                         for(index_t wo = 0; wo < Wo; ++wo)
                         {
-                            index_t row    = n * Ho * Wo + ho * Wo + wo;
+                            index_t row    = 0;
                             index_t column = 0;
+                            if constexpr(std::is_same_v<ImageLayout, GNHWC>)
+                            {
+                                row = g * N * Ho * Wo + n * Ho * Wo + ho * Wo + wo;
+                            }
+                            else if constexpr(std::is_same_v<ImageLayout, NHWGC>)
+                            {
+                                row = n * Ho * Wo * G + ho * Wo * G + wo * G + g;
+                            }
                             for(index_t y = 0; y < arg.filter_spatial_lengths_[0]; ++y)
                             {
@@ -178,8 +194,8 @@ struct ReferenceColumnToImage : public device::BaseOperator
                                             float v_in =
                                                 ck::type_convert<float>(arg.input_(row, column));
                                             float v_out = ck::type_convert<float>(
-                                                arg.output_(0, n, c, hi, wi));
-                                            arg.output_(0, n, c, hi, wi) =
+                                                arg.output_(g, n, c, hi, wi));
+                                            arg.output_(g, n, c, hi, wi) =
                                                 ck::type_convert<OutDataType>(v_in + v_out);
                                         }
                                         column++;
@@ -190,7 +206,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                     }
                 };
-                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
                 return 0;
             }
@@ -200,15 +216,25 @@ struct ReferenceColumnToImage : public device::BaseOperator
                 const index_t Ho = arg.output_spatial_lengths_[1];
                 const index_t Wo = arg.output_spatial_lengths_[2];
-                auto func = [&](auto n) {
+                auto func = [&](auto g, auto n) {
                     for(index_t d_o = 0; d_o < Do; ++d_o)
                     {
                         for(index_t ho = 0; ho < Ho; ++ho)
                         {
                             for(index_t wo = 0; wo < Wo; ++wo)
                             {
-                                index_t row    = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
+                                index_t row    = 0;
                                 index_t column = 0;
+                                if constexpr(std::is_same_v<ImageLayout, GNDHWC>)
+                                {
+                                    row = g * N * Do * Ho * Wo + n * Do * Ho * Wo + d_o * Ho * Wo +
+                                          ho * Wo + wo;
+                                }
+                                else if constexpr(std::is_same_v<ImageLayout, NDHWGC>)
+                                {
+                                    row = n * Do * Ho * Wo * G + d_o * Ho * Wo * G + ho * Wo * G +
+                                          wo * G + g;
+                                }
                                 for(index_t z = 0; z < arg.filter_spatial_lengths_[0]; ++z)
                                 {
@@ -247,8 +273,8 @@ struct ReferenceColumnToImage : public device::BaseOperator
                                                 float v_in = ck::type_convert<float>(
                                                     arg.input_(row, column));
                                                 float v_out = ck::type_convert<float>(
-                                                    arg.output_(0, n, c, di, hi, wi));
-                                                arg.output_(0, n, c, di, hi, wi) =
+                                                    arg.output_(g, n, c, di, hi, wi));
+                                                arg.output_(g, n, c, di, hi, wi) =
                                                     ck::type_convert<OutDataType>(v_in + v_out);
                                             }
                                             column++;
@@ -261,7 +287,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                     }
                 };
-                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
                 return 0;
             }
...
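The row computations added to the reference differ only in where the group index lands: outermost for GNHWC, innermost (just before C) for NWGC/NHWGC, which is also why the device code scales gemm strides by G for those layouts. A small standalone check of both formulas with illustrative sizes:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // Illustrative sizes; any in-range indices work.
        const std::int64_t G = 2, N = 4, Ho = 5, Wo = 5;
        const std::int64_t g = 1, n = 2, ho = 3, wo = 4;

        // GNHWC: the group is the slowest-varying part of the row index.
        const std::int64_t row_gnhwc = g * N * Ho * Wo + n * Ho * Wo + ho * Wo + wo;

        // NHWGC: rows for the same (n, ho, wo) but different groups are adjacent.
        const std::int64_t row_nhwgc = n * Ho * Wo * G + ho * Wo * G + wo * G + g;

        assert(row_gnhwc == 169); // 1*100 + 2*25 + 3*5 + 4
        assert(row_nhwgc == 139); // 2*50 + 3*10 + 4*2 + 1
        return 0;
    }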
...@@ -19,9 +19,7 @@ namespace host { ...@@ -19,9 +19,7 @@ namespace host {
* \brief Reference implementation for image to column. * \brief Reference implementation for image to column.
* *
* Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. * Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C]. * Output tensor descriptor has [G * N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Memory layout is the same.
* *
* \tparam NDimSpatial Number of spatial dimensions. * \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Image Layout. * \tparam ImageLayout Image Layout.
...@@ -94,21 +92,31 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -94,21 +92,31 @@ struct ReferenceImageToColumn : public device::BaseOperator
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
using namespace ck::tensor_layout::convolution;
if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 && if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 &&
arg.output_.GetNumOfDimension() == 2)) arg.output_.GetNumOfDimension() == 2))
{ {
throw std::runtime_error("wrong! inconsistent dimension"); throw std::runtime_error("wrong! inconsistent dimension");
} }
const index_t G = arg.input_.GetLengths()[0];
const index_t N = arg.input_.GetLengths()[1]; const index_t N = arg.input_.GetLengths()[1];
const index_t C = arg.input_.GetLengths()[2]; const index_t C = arg.input_.GetLengths()[2];
if constexpr(NDimSpatial == 1) if constexpr(NDimSpatial == 1)
{ {
const index_t Wo = arg.output_spatial_lengths_[0]; const index_t Wo = arg.output_spatial_lengths_[0];
auto func = [&](auto n, auto wo) { auto func = [&](auto g, auto n, auto wo) {
index_t row = n * Wo + wo; index_t row = 0;
index_t column = 0; index_t column = 0;
if constexpr(std::is_same_v<ImageLayout, GNWC>)
{
row = g * N * Wo + n * Wo + wo;
}
else if constexpr(std::is_same_v<ImageLayout, NWGC>)
{
row = n * Wo * G + wo * G + g;
}
for(index_t x = 0; x < arg.filter_spatial_lengths_[0]; ++x) for(index_t x = 0; x < arg.filter_spatial_lengths_[0]; ++x)
{ {
...@@ -121,7 +129,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -121,7 +129,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
if(wi >= 0 && if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3]) ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
{ {
InDataType v_in = arg.input_(0, n, c, wi); InDataType v_in = arg.input_(g, n, c, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in); arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
} }
column++; column++;
...@@ -129,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -129,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -138,9 +146,17 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -138,9 +146,17 @@ struct ReferenceImageToColumn : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[0]; const index_t Ho = arg.output_spatial_lengths_[0];
const index_t Wo = arg.output_spatial_lengths_[1]; const index_t Wo = arg.output_spatial_lengths_[1];
auto func = [&](auto n, auto ho, auto wo) { auto func = [&](auto g, auto n, auto ho, auto wo) {
index_t row = n * Ho * Wo + ho * Wo + wo; index_t row = 0;
index_t column = 0; index_t column = 0;
if constexpr(std::is_same_v<ImageLayout, GNHWC>)
{
row = g * N * Ho * Wo + n * Ho * Wo + ho * Wo + wo;
}
else if constexpr(std::is_same_v<ImageLayout, NHWGC>)
{
row = n * Ho * Wo * G + ho * Wo * G + wo * G + g;
}
for(index_t y = 0; y < arg.filter_spatial_lengths_[0]; ++y) for(index_t y = 0; y < arg.filter_spatial_lengths_[0]; ++y)
{ {
...@@ -162,7 +178,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -162,7 +178,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
wi >= 0 && wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4]) ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
{ {
InDataType v_in = arg.input_(0, n, c, hi, wi); InDataType v_in = arg.input_(g, n, c, hi, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in); arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
} }
column++; column++;
...@@ -171,7 +187,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -171,7 +187,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -181,9 +197,18 @@ struct ReferenceImageToColumn : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[1];
const index_t Wo = arg.output_spatial_lengths_[2];
auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
index_t row = 0;
index_t column = 0;
if constexpr(std::is_same_v<ImageLayout, GNDHWC>)
{
row =
g * N * Do * Ho * Wo + n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
}
else if constexpr(std::is_same_v<ImageLayout, NDHWGC>)
{
row = n * Do * Ho * Wo * G + d_o * Ho * Wo * G + ho * Wo * G + wo * G + g;
}
for(index_t z = 0; z < arg.filter_spatial_lengths_[0]; ++z)
{
...@@ -213,7 +238,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
ck::type_convert<std::size_t>(wi) <
arg.input_.GetLengths()[5])
{
InDataType v_in = arg.input_(g, n, c, di, hi, wi);
arg.output_(row, column) =
ck::type_convert<OutDataType>(v_in);
}
...@@ -224,7 +249,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
}
};
make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(
std::thread::hardware_concurrency());
return 0;
...
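// Across the 1D/2D/3D reference paths above, the im2col output is a 2D matrix
// whose row count now carries the extra factor of G, while the column count
// (filter window times channels per group) is unchanged. A small illustrative
// helper for the 2D extents (assumed names, not part of this commit):
struct MatrixShape
{
    long long rows; // one row per (g, n, ho, wo), matching the functor above
    long long cols; // one column per (y, x, c) within a group
};
MatrixShape im2col_shape_2d(int G, int N, int Ho, int Wo, int Y, int X, int C)
{
    return {1LL * G * N * Ho * Wo, 1LL * Y * X * C};
}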
...@@ -19,109 +19,214 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op;
// GNWC/GNHWC/GNDHWC
// Image to Column
// gnwc, 1d
void add_device_image_to_column_gnwc_1d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ImageToColumn>>>&
instances);
void add_device_image_to_column_gnwc_1d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ImageToColumn>>>&
instances);
void add_device_image_to_column_gnwc_1d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ImageToColumn>>>&
instances);
void add_device_image_to_column_gnwc_1d_i8_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ImageToColumn>>>&
instances);
// gnhwc, 2d
void add_device_image_to_column_gnhwc_2d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ImageToColumn>>>&
instances);
void add_device_image_to_column_gnhwc_2d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ImageToColumn>>>&
instances);
void add_device_image_to_column_gnhwc_2d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ImageToColumn>>>&
instances);
void add_device_image_to_column_gnhwc_2d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ImageToColumn>>>&
instances);
// gndhwc, 3d
void add_device_image_to_column_gndhwc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
instances);
void add_device_image_to_column_gndhwc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
instances);
void add_device_image_to_column_gndhwc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
instances);
void add_device_image_to_column_gndhwc_3d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
instances);
// Column to Image
// gnwc, 1d
void add_device_column_to_image_gnwc_1d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
instances);
void add_device_column_to_image_gnwc_1d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
instances);
void add_device_column_to_image_gnwc_1d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
instances);
void add_device_column_to_image_gnwc_1d_i8_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
instances);
// gnhwc, 2d
void add_device_column_to_image_gnhwc_2d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
instances);
void add_device_column_to_image_gnhwc_2d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
instances);
void add_device_column_to_image_gnhwc_2d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
instances);
void add_device_column_to_image_gnhwc_2d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
instances);
// gndhwc, 3d
void add_device_column_to_image_gndhwc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
instances);
void add_device_column_to_image_gndhwc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
instances);
void add_device_column_to_image_gndhwc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
instances);
void add_device_column_to_image_gndhwc_3d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
instances);
// NWGC/NHWGC/NDHWGC
// Image to Column
// nwgc, 1d
void add_device_image_to_column_nwgc_1d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ImageToColumn>>>&
instances);
void add_device_image_to_column_nwgc_1d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ImageToColumn>>>&
instances);
void add_device_image_to_column_nwgc_1d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ImageToColumn>>>&
instances);
void add_device_image_to_column_nwgc_1d_i8_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ImageToColumn>>>&
instances);
// nhwgc, 2d
void add_device_image_to_column_nhwgc_2d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ImageToColumn>>>&
instances);
void add_device_image_to_column_nhwgc_2d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ImageToColumn>>>&
instances);
void add_device_image_to_column_nhwgc_2d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ImageToColumn>>>&
instances);
void add_device_image_to_column_nhwgc_2d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ImageToColumn>>>&
instances);
// ndhwgc, 3d
void add_device_image_to_column_ndhwgc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ImageToColumn>>>&
instances);
void add_device_image_to_column_ndhwgc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ImageToColumn>>>&
instances);
void add_device_image_to_column_ndhwgc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ImageToColumn>>>&
instances);
void add_device_image_to_column_ndhwgc_3d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ImageToColumn>>>&
instances);
// Column to Image
// nwgc, 1d
void add_device_column_to_image_nwgc_1d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ColumnToImage>>>&
instances);
void add_device_column_to_image_nwgc_1d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ColumnToImage>>>&
instances);
void add_device_column_to_image_nwgc_1d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ColumnToImage>>>&
instances);
void add_device_column_to_image_nwgc_1d_i8_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ColumnToImage>>>&
instances);
// nhwgc, 2d
void add_device_column_to_image_nhwgc_2d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ColumnToImage>>>&
instances);
void add_device_column_to_image_nhwgc_2d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ColumnToImage>>>&
instances);
void add_device_column_to_image_nhwgc_2d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ColumnToImage>>>&
instances);
void add_device_column_to_image_nhwgc_2d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ColumnToImage>>>&
instances);
// ndhwgc, 3d
void add_device_column_to_image_ndhwgc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ColumnToImage>>>&
instances);
void add_device_column_to_image_ndhwgc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ColumnToImage>>>&
instances);
void add_device_column_to_image_ndhwgc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ColumnToImage>>>&
instances);
void add_device_column_to_image_ndhwgc_3d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ColumnToImage>>>&
instances);
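// Usage sketch (assumptions: the instance headers above are included and we
// are inside ck::tensor_operation::device::instance, where these declarations
// and the F16/NHWGC aliases are visible). A caller can fill an instance list
// for one of the new grouped layouts directly via the registration functions:
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ImageToColumn>>>
    op_ptrs;
add_device_image_to_column_nhwgc_2d_f16_instances(op_ptrs);
// op_ptrs stays empty if CK_ENABLE_FP16 was not defined when the instance
// library was built (see the #ifdef guards in the definitions further below).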
template <ck::index_t NumDimSpatial,
typename ImageLayout,
...@@ -151,60 +256,120 @@ struct DeviceOperationInstanceFactory<
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_image_to_column_gnwc_1d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_image_to_column_gnwc_1d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_image_to_column_gnwc_1d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_image_to_column_gnwc_1d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_image_to_column_gnhwc_2d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_image_to_column_gnhwc_2d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_image_to_column_gnhwc_2d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_image_to_column_gnhwc_2d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_image_to_column_gndhwc_3d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_image_to_column_gndhwc_3d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_image_to_column_gndhwc_3d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_image_to_column_gndhwc_3d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, NWGC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_image_to_column_nwgc_1d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_image_to_column_nwgc_1d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_image_to_column_nwgc_1d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_image_to_column_nwgc_1d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, NHWGC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_image_to_column_nhwgc_2d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_image_to_column_nhwgc_2d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_image_to_column_nhwgc_2d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_image_to_column_nhwgc_2d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, NDHWGC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_image_to_column_ndhwgc_3d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_image_to_column_ndhwgc_3d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_image_to_column_ndhwgc_3d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_image_to_column_ndhwgc_3d_i8_instances(op_ptrs);
}
}
}
...@@ -214,60 +379,120 @@ struct DeviceOperationInstanceFactory<
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_column_to_image_gnwc_1d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_column_to_image_gnwc_1d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_column_to_image_gnwc_1d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_column_to_image_gnwc_1d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_column_to_image_gnhwc_2d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_column_to_image_gnhwc_2d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_column_to_image_gnhwc_2d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_column_to_image_gnhwc_2d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_column_to_image_gndhwc_3d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_column_to_image_gndhwc_3d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_column_to_image_gndhwc_3d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_column_to_image_gndhwc_3d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, NWGC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_column_to_image_nwgc_1d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_column_to_image_nwgc_1d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_column_to_image_nwgc_1d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_column_to_image_nwgc_1d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, NHWGC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_column_to_image_nhwgc_2d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_column_to_image_nhwgc_2d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_column_to_image_nhwgc_2d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_column_to_image_nhwgc_2d_i8_instances(op_ptrs);
}
}
else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, NDHWGC>)
{
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
{
add_device_column_to_image_ndhwgc_3d_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
{
add_device_column_to_image_ndhwgc_3d_f16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_column_to_image_ndhwgc_3d_bf16_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
{
add_device_column_to_image_ndhwgc_3d_i8_instances(op_ptrs);
}
}
}
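// Note on the dispatch above: the instance list is resolved entirely at
// compile time from the (NumDimSpatial, ImageLayout, data type) triple, so an
// unsupported combination simply adds nothing to op_ptrs. An illustrative
// trait (hypothetical, not part of this commit) capturing the new layout
// split:
template <ck::index_t NumDimSpatial, typename ImageLayout>
inline constexpr bool is_g_last_layout =
    (NumDimSpatial == 1 && is_same_v<ImageLayout, NWGC>) ||
    (NumDimSpatial == 2 && is_same_v<ImageLayout, NHWGC>) ||
    (NumDimSpatial == 3 && is_same_v<ImageLayout, NDHWGC>);
// The GNWC/GNHWC/GNDHWC family is the "G-first" counterpart, with the group
// dimension outermost in memory.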
...
add_instance_library(device_column_to_image_instance
device_column_to_image_gnwc_1d_instance.cpp
device_column_to_image_gnhwc_2d_instance.cpp
device_column_to_image_gndhwc_3d_instance.cpp
device_column_to_image_nwgc_1d_instance.cpp
device_column_to_image_nhwgc_2d_instance.cpp
device_column_to_image_ndhwgc_3d_instance.cpp
)
...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_gndhwc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
instances)
{
...@@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances(
#endif
}
void add_device_column_to_image_gndhwc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
instances)
{
...@@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances(
#endif
}
void add_device_column_to_image_gndhwc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
instances)
{
...@@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances(
#endif
}
void add_device_column_to_image_gndhwc_3d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
instances)
...
...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_gnhwc_2d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
instances)
{
...@@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances(
#endif
}
void add_device_column_to_image_gnhwc_2d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
instances)
{
...@@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances(
#endif
}
void add_device_column_to_image_gnhwc_2d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
instances)
{
...@@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances(
#endif
}
void add_device_column_to_image_gnhwc_2d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
instances)
...
...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_gnwc_1d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
instances)
{
...@@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances(
#endif
}
void add_device_column_to_image_gnwc_1d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
instances)
{
...@@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances(
#endif
}
void add_device_column_to_image_gnwc_1d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
instances)
{
...@@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances(
#endif
}
void add_device_column_to_image_gnwc_1d_i8_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
instances)
{
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_ndhwgc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances(instances, device_column_to_image_bf16_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_ndhwgc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_column_to_image_f16_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_ndhwgc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_column_to_image_f32_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_ndhwgc_3d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances(instances, device_column_to_image_i8_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_nhwgc_2d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances(instances, device_column_to_image_bf16_instances<2, NHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_nhwgc_2d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_column_to_image_f16_instances<2, NHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_nhwgc_2d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_column_to_image_f32_instances<2, NHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_nhwgc_2d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances(instances, device_column_to_image_i8_instances<2, NHWGC>{});
#else
ignore = instances;
#endif
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_nwgc_1d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances(instances, device_column_to_image_bf16_instances<1, NWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_nwgc_1d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_column_to_image_f16_instances<1, NWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_nwgc_1d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_column_to_image_f32_instances<1, NWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_nwgc_1d_i8_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances(instances, device_column_to_image_i8_instances<1, NWGC>{});
#else
ignore = instances;
#endif
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
add_instance_library(device_image_to_column_instance
device_image_to_column_gnwc_1d_instance.cpp
device_image_to_column_gnhwc_2d_instance.cpp
device_image_to_column_gndhwc_3d_instance.cpp
device_image_to_column_nwgc_1d_instance.cpp
device_image_to_column_nhwgc_2d_instance.cpp
device_image_to_column_ndhwgc_3d_instance.cpp
)
...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_image_to_column_gndhwc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
instances)
{
...@@ -22,7 +22,7 @@ void add_device_image_to_column_ndhwc_3d_bf16_instances(
#endif
}
void add_device_image_to_column_gndhwc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
instances)
{
...@@ -33,7 +33,7 @@ void add_device_image_to_column_ndhwc_3d_f16_instances(
#endif
}
void add_device_image_to_column_gndhwc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
instances)
{
...@@ -44,7 +44,7 @@ void add_device_image_to_column_ndhwc_3d_f32_instances(
#endif
}
void add_device_image_to_column_gndhwc_3d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
instances)
...