Commit c5335964 authored by root's avatar root
Browse files

added pk_cvt

parents cc9d2a84 f27ea94e
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
constexpr ck::index_t NDimSpatial = 3;
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using OutDataType = ck::half_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::GNDHWC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::GNDHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <typename OutElementOp>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<OutLayout, OutLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<OutDataType, OutDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,
1,
S<1, 32, 1, 8>,
8>;
using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
namespace {
// Use custom implementation to pass two more tensors for post op
template <ck::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance>
bool run_grouped_conv_fwd(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
constexpr ck::index_t NumDs = 2;
Tensor<InDataType> in(in_g_n_c_wis_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
std::array<Tensor<OutDataType>, NumDs> d_tensors = {Tensor<OutDataType>(out_g_n_k_wos_desc),
Tensor<OutDataType>(out_g_n_k_wos_desc)};
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
d_tensors[0].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
d_tensors[1].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
d_tensors[0].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
d_tensors[1].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem d0_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize());
DeviceMem d1_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
d0_buf.ToDevice(d_tensors[0].mData.data());
d1_buf.ToDevice(d_tensors[1].mData.data());
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
const std::array<const void*, NumDs> ds = {d0_buf.GetDeviceBuffer(), d1_buf.GetDeviceBuffer()};
auto conv = DeviceConvNDFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
ds,
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
e_g_n_k_wos_lengths, e_g_n_k_wos_lengths},
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
e_g_n_k_wos_strides, e_g_n_k_wos_strides},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error("The device op with the specified compilation parameters does "
"not support this convolution problem.");
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop =
conv_param.GetFlops() + 2 * conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
2 * conv_param.GetOutputByte<OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(do_verification)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
NumDs>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
wei,
out_host,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op,
d_tensors);
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
}
return true;
}
} // namespace
#include "run_convnd_fwd_activ_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
...@@ -14,11 +14,12 @@ namespace device { ...@@ -14,11 +14,12 @@ namespace device {
/** /**
* \brief Convolution Tensor Rearrange. * \brief Convolution Tensor Rearrange.
* *
* This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to * This Device operator supports converting an image to
* the gemm problem([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and * the GEMM representation (Image to Column) and
* conversion gemm form to the image (Column to Image). * converting a GEMM form to the image (Column to Image).
* * Supported layouts:
* Note that G must be equal to 1. * [G, N, Di, Hi, Wi, C] <-> [G, N * Do * Ho * Wo, Z * Y * X * C]
* [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo, G, Z * Y * X * C]
* *
* \tparam NDimSpatial Number of spatial dimensions. * \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Input Layout. * \tparam ImageLayout Input Layout.
...@@ -39,13 +40,14 @@ struct DeviceConvTensorRearrange : public BaseOperator ...@@ -39,13 +40,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
* *
* \param p_in A pointer to the device memory of the input image. * \param p_in A pointer to the device memory of the input image.
* \param p_out A pointer to the device memory of the output. * \param p_out A pointer to the device memory of the output.
* \param G Convolution number of groups.
* \param N Convolution batch size. * \param N Convolution batch size.
* \param C Convolution number of channels. * \param C Convolution number of channels.
* \param input_spatial_lengths Input spatial lengths. * \param input_spatial_lengths Input spatial lengths.
* \param filter_spatial_lengths Filter spatial lengths. * \param filter_spatial_lengths Filter spatial lengths.
* \param output_spatial_lengths Output spatial lengths. * \param output_spatial_lengths Output spatial lengths.
* \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W]. * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
* \param gemm_m_k_strides Gemm form strides. * \param gemm_g_m_k_strides Gemm form strides.
* \param conv_filter_strides Convolution filter strides. * \param conv_filter_strides Convolution filter strides.
* \param conv_filter_dilations Convolution filter dilations. * \param conv_filter_dilations Convolution filter dilations.
* \param input_left_pads Convolution left pads. * \param input_left_pads Convolution left pads.
...@@ -55,13 +57,14 @@ struct DeviceConvTensorRearrange : public BaseOperator ...@@ -55,13 +57,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
virtual std::unique_ptr<BaseArgument> virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in, MakeArgumentPointer(const void* p_in,
void* p_out, void* p_out,
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
......
...@@ -17,15 +17,18 @@ ...@@ -17,15 +17,18 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/io.hpp" #include "ck/host_utility/io.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
namespace device { namespace device {
// Image to column for input layout NDHWC: // Column to Image:
// input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C] // input : gemm form [G, N * Do * Ho * Wo, Z * Y * X * C]
// output : image [N, Di, Hi, Wi, C] // output : input image [G, N, Di, Hi, Wi, C]
// input : gemm form [N * Do * Ho * Wo, G, Z * Y * X * C]
// output : input image [N, Di, Hi, Wi, G, C]
template <index_t NDimSpatial, template <index_t NDimSpatial,
typename ImageLayout, typename ImageLayout,
typename InputDataType, typename InputDataType,
...@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl ...@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl
OutputDataType, OutputDataType,
conv_tensor_rearrange_op::ColumnToImage> conv_tensor_rearrange_op::ColumnToImage>
{ {
static constexpr bool is_NSpatialGC =
std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
static constexpr bool is_GNSpatialC =
std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{}; static constexpr auto I1 = Number<1>{};
...@@ -90,7 +101,7 @@ struct DeviceColumnToImageImpl ...@@ -90,7 +101,7 @@ struct DeviceColumnToImageImpl
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& independent_filters, const std::array<index_t, NDimSpatial>& independent_filters,
const std::array<index_t, NDimSpatial>& effs) const std::array<index_t, NDimSpatial>& effs)
{ {
...@@ -100,23 +111,23 @@ struct DeviceColumnToImageImpl ...@@ -100,23 +111,23 @@ struct DeviceColumnToImageImpl
C * ck::accumulate_n<index_t>( C * ck::accumulate_n<index_t>(
filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()); filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1]; const index_t NStride = DoHoWo * gemm_g_m_k_strides[I1] * gemm_g_m_k_strides[I2];
// Calculate the appropriate stride for each set of independent filters // Calculate the appropriate stride for each set of independent filters
// in each dimension // in each dimension
const index_t WStride = const index_t WStride = math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) *
math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0]; gemm_g_m_k_strides[I1];
const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) * const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
output_spatial_lengths[XIdx] * gemm_m_k_strides[I0]; output_spatial_lengths[XIdx] * gemm_g_m_k_strides[I1];
const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) * const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
gemm_m_k_strides[I0]; gemm_g_m_k_strides[I1];
// Create descriptor for independent filters in each dimension and // Create descriptor for independent filters in each dimension and
// then merge them into column form // then merge them into column form
if constexpr(NDimSpatial == 1) if constexpr(NDimSpatial == 1)
{ {
const auto desc_gemm_form = const auto desc_gemm_form =
make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX), make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX),
make_tuple(NStride, WStride, gemm_m_k_strides[I1])); make_tuple(NStride, WStride, gemm_g_m_k_strides[I2]));
const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
desc_gemm_form, desc_gemm_form,
make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])), make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])),
...@@ -130,7 +141,7 @@ struct DeviceColumnToImageImpl ...@@ -130,7 +141,7 @@ struct DeviceColumnToImageImpl
{ {
const auto desc_gemm_form = make_naive_tensor_descriptor( const auto desc_gemm_form = make_naive_tensor_descriptor(
make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX), make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX),
make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1])); make_tuple(NStride, HStride, WStride, gemm_g_m_k_strides[I2]));
const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
desc_gemm_form, desc_gemm_form,
make_tuple(make_merge_transform( make_tuple(make_merge_transform(
...@@ -149,7 +160,7 @@ struct DeviceColumnToImageImpl ...@@ -149,7 +160,7 @@ struct DeviceColumnToImageImpl
independent_filters[YIdx], independent_filters[YIdx],
independent_filters[XIdx], independent_filters[XIdx],
CZYX), CZYX),
make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1])); make_tuple(NStride, DStride, HStride, WStride, gemm_g_m_k_strides[I2]));
const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
desc_gemm_form, desc_gemm_form,
make_tuple(make_merge_transform(make_tuple(N, make_tuple(make_merge_transform(make_tuple(N,
...@@ -252,34 +263,38 @@ struct DeviceColumnToImageImpl ...@@ -252,34 +263,38 @@ struct DeviceColumnToImageImpl
decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>( decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
InputGridDesc{}))>; InputGridDesc{}))>;
using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc, using GridwiseTensorRearrangeKernel =
InputDataType, GridwiseTensorRearrange<InputGridDesc,
OutputGridDesc, InputDataType,
OutputDataType, OutputGridDesc,
BlockSize, OutputDataType,
MPerBlock, BlockSize,
KPerBlock, MPerBlock,
ThreadClusterLengths, KPerBlock,
ScalarPerVector, ThreadClusterLengths,
InMemoryDataOperationEnum::Add, ScalarPerVector,
Block2ETileMap>; InMemoryDataOperationEnum::Add,
Block2ETileMap,
ComputePtrOffsetOfStridedBatch<I0>>;
struct Argument : public BaseArgument struct Argument : public BaseArgument
{ {
Argument(const void* p_in, // input image Argument(const void* p_in, // input image
void* p_out, // output image void* p_out, // output image
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads) const std::array<index_t, NDimSpatial>& input_right_pads)
: C_(C), : G_(G),
C_(C),
X_(filter_spatial_lengths[NDimSpatial - I1]), X_(filter_spatial_lengths[NDimSpatial - I1]),
p_in_{static_cast<const InputDataType*>(p_in)}, p_in_{static_cast<const InputDataType*>(p_in)},
p_out_{static_cast<OutputDataType*>(p_out)}, p_out_{static_cast<OutputDataType*>(p_out)},
...@@ -289,6 +304,9 @@ struct DeviceColumnToImageImpl ...@@ -289,6 +304,9 @@ struct DeviceColumnToImageImpl
input_left_pads_{input_left_pads}, input_left_pads_{input_left_pads},
input_right_pads_{input_right_pads} input_right_pads_{input_right_pads}
{ {
compute_ptr_offset_of_batch_.BatchStrideA_ = gemm_g_m_k_strides[I0];
compute_ptr_offset_of_batch_.BatchStrideC_ = image_g_n_c_wis_strides[I0];
const index_t x_eff = const index_t x_eff =
(filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1; (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1;
const index_t y_eff = const index_t y_eff =
...@@ -354,7 +372,7 @@ struct DeviceColumnToImageImpl ...@@ -354,7 +372,7 @@ struct DeviceColumnToImageImpl
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
conv_filter_strides, conv_filter_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
independent_filters, independent_filters,
effs); effs);
const auto out_grid_desc_m_k = const auto out_grid_desc_m_k =
...@@ -387,10 +405,9 @@ struct DeviceColumnToImageImpl ...@@ -387,10 +405,9 @@ struct DeviceColumnToImageImpl
// Memory offsets to next set of independent filters, // Memory offsets to next set of independent filters,
// move to independent filters in each dimension // move to independent filters in each dimension
const index_t in_offset = const index_t in_offset =
x_idx * gemm_m_k_strides[0] + (x_idx + y_idx * output_spatial_lengths[XIdx] +
y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] + z_idx * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx]) *
z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] * gemm_g_m_k_strides[I1];
output_spatial_lengths[XIdx];
// Move to independent filters in appropriate dimensions // Move to independent filters in appropriate dimensions
const index_t out_offset = const index_t out_offset =
x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] + x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] +
...@@ -417,6 +434,7 @@ struct DeviceColumnToImageImpl ...@@ -417,6 +434,7 @@ struct DeviceColumnToImageImpl
} }
} }
const ck::index_t G_;
const ck::index_t C_; const ck::index_t C_;
const ck::index_t X_; const ck::index_t X_;
...@@ -434,6 +452,8 @@ struct DeviceColumnToImageImpl ...@@ -434,6 +452,8 @@ struct DeviceColumnToImageImpl
std::vector<const InputDataType*> p_in_container_; std::vector<const InputDataType*> p_in_container_;
std::vector<OutputDataType*> p_out_container_; std::vector<OutputDataType*> p_out_container_;
ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
}; };
struct Invoker : public BaseInvoker struct Invoker : public BaseInvoker
...@@ -451,6 +471,7 @@ struct DeviceColumnToImageImpl ...@@ -451,6 +471,7 @@ struct DeviceColumnToImageImpl
OutputGridDesc, OutputGridDesc,
OutputDataType, OutputDataType,
Block2ETileMap, Block2ETileMap,
ComputePtrOffsetOfStridedBatch<I0>,
GridwiseTensorRearrangeKernel>; GridwiseTensorRearrangeKernel>;
// Execute each set of independent filters // Execute each set of independent filters
...@@ -460,7 +481,7 @@ struct DeviceColumnToImageImpl ...@@ -460,7 +481,7 @@ struct DeviceColumnToImageImpl
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>( BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
arg.out_grid_desc_m_k_container_[i]); arg.out_grid_desc_m_k_container_[i]);
const index_t grid_size = const index_t grid_size =
block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]); block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]) * arg.G_;
elapsed_time += launch_and_time_kernel(stream_config, elapsed_time += launch_and_time_kernel(stream_config,
kernel, kernel,
dim3(grid_size), dim3(grid_size),
...@@ -470,7 +491,9 @@ struct DeviceColumnToImageImpl ...@@ -470,7 +491,9 @@ struct DeviceColumnToImageImpl
arg.p_in_container_[i], arg.p_in_container_[i],
arg.out_grid_desc_m_k_container_[i], arg.out_grid_desc_m_k_container_[i],
arg.p_out_container_[i], arg.p_out_container_[i],
block_2_tile_map); arg.G_,
block_2_tile_map,
arg.compute_ptr_offset_of_batch_);
} }
return elapsed_time; return elapsed_time;
} }
...@@ -485,8 +508,7 @@ struct DeviceColumnToImageImpl ...@@ -485,8 +508,7 @@ struct DeviceColumnToImageImpl
bool IsSupportedArgument(const Argument& arg) bool IsSupportedArgument(const Argument& arg)
{ {
using namespace tensor_layout::convolution; using namespace tensor_layout::convolution;
if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> || if constexpr(!(is_NSpatialGC || is_GNSpatialC))
std::is_same_v<ImageLayout, GNDHWC>))
{ {
return false; return false;
} }
...@@ -534,13 +556,14 @@ struct DeviceColumnToImageImpl ...@@ -534,13 +556,14 @@ struct DeviceColumnToImageImpl
static auto MakeArgument(const void* p_in, // input image static auto MakeArgument(const void* p_in, // input image
void* p_out, // output image void* p_out, // output image
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
...@@ -548,13 +571,14 @@ struct DeviceColumnToImageImpl ...@@ -548,13 +571,14 @@ struct DeviceColumnToImageImpl
{ {
return Argument{static_cast<const InputDataType*>(p_in), return Argument{static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out), static_cast<OutputDataType*>(p_out),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -566,13 +590,14 @@ struct DeviceColumnToImageImpl ...@@ -566,13 +590,14 @@ struct DeviceColumnToImageImpl
std::unique_ptr<BaseArgument> std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in, // input image MakeArgumentPointer(const void* p_in, // input image
void* p_out, // output image void* p_out, // output image
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
...@@ -580,13 +605,14 @@ struct DeviceColumnToImageImpl ...@@ -580,13 +605,14 @@ struct DeviceColumnToImageImpl
{ {
return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in), return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out), static_cast<OutputDataType*>(p_out),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
......
...@@ -184,7 +184,8 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout, ...@@ -184,7 +184,8 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
return false; return false;
} }
} }
else if(ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940") else if(ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940" ||
ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942")
{ {
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> || if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>)) is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))
......
...@@ -15,15 +15,18 @@ ...@@ -15,15 +15,18 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/io.hpp" #include "ck/host_utility/io.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
namespace device { namespace device {
// Image to column for input layout NDHWC: // Image to column:
// input : input image [N, Di, Hi, Wi, C] // input : input image [G, N, Di, Hi, Wi, C]
// output : gemm form [N * Do * Ho * Wo, Z * Y * X * C] // output : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C]
// input : input image [N, Di, Hi, Wi, G, C]
// output : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C]
template <index_t NDimSpatial, template <index_t NDimSpatial,
typename ImageLayout, typename ImageLayout,
typename InputDataType, typename InputDataType,
...@@ -41,6 +44,14 @@ struct DeviceImageToColumnImpl ...@@ -41,6 +44,14 @@ struct DeviceImageToColumnImpl
OutputDataType, OutputDataType,
conv_tensor_rearrange_op::ImageToColumn> conv_tensor_rearrange_op::ImageToColumn>
{ {
static constexpr bool is_NSpatialGC =
std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
static constexpr bool is_GNSpatialC =
std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{}; static constexpr auto I1 = Number<1>{};
...@@ -109,7 +120,7 @@ struct DeviceImageToColumnImpl ...@@ -109,7 +120,7 @@ struct DeviceImageToColumnImpl
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, 2>& gemm_m_k_strides) const std::array<index_t, 3>& gemm_g_m_k_strides)
{ {
const index_t NDoHoWo = const index_t NDoHoWo =
N * ck::accumulate_n<index_t>( N * ck::accumulate_n<index_t>(
...@@ -117,11 +128,10 @@ struct DeviceImageToColumnImpl ...@@ -117,11 +128,10 @@ struct DeviceImageToColumnImpl
const index_t CZYX = const index_t CZYX =
C * ck::accumulate_n<index_t>( C * ck::accumulate_n<index_t>(
filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()); filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
const auto desc_mraw_kraw = make_naive_tensor_descriptor(
make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1]));
const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw); const auto desc_mraw_kraw = make_naive_tensor_descriptor(
return desc_m_k; make_tuple(NDoHoWo, CZYX), make_tuple(gemm_g_m_k_strides[I1], gemm_g_m_k_strides[I2]));
return matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
} }
using InputGridDesc = using InputGridDesc =
...@@ -132,34 +142,38 @@ struct DeviceImageToColumnImpl ...@@ -132,34 +142,38 @@ struct DeviceImageToColumnImpl
decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>( decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
OutputGridDesc{}))>; OutputGridDesc{}))>;
using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc, using GridwiseTensorRearrangeKernel =
InputDataType, GridwiseTensorRearrange<InputGridDesc,
OutputGridDesc, InputDataType,
OutputDataType, OutputGridDesc,
BlockSize, OutputDataType,
MPerBlock, BlockSize,
KPerBlock, MPerBlock,
ThreadClusterLengths, KPerBlock,
ScalarPerVector, ThreadClusterLengths,
InMemoryDataOperationEnum::Set, ScalarPerVector,
Block2ETileMap>; InMemoryDataOperationEnum::Set,
Block2ETileMap,
ComputePtrOffsetOfStridedBatch<I0>>;
struct Argument : public BaseArgument struct Argument : public BaseArgument
{ {
Argument(const void* p_in, // input image Argument(const void* p_in, // input image
void* p_out, // gemm form void* p_out, // gemm form
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads) const std::array<index_t, NDimSpatial>& input_right_pads)
: C_(C), : G_(G),
C_(C),
X_(filter_spatial_lengths[NDimSpatial - I1]), X_(filter_spatial_lengths[NDimSpatial - I1]),
p_in_{static_cast<const InputDataType*>(p_in)}, p_in_{static_cast<const InputDataType*>(p_in)},
p_out_{static_cast<OutputDataType*>(p_out)}, p_out_{static_cast<OutputDataType*>(p_out)},
...@@ -176,14 +190,16 @@ struct DeviceImageToColumnImpl ...@@ -176,14 +190,16 @@ struct DeviceImageToColumnImpl
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
input_right_pads); input_right_pads);
out_grid_desc_m_k_ = MakeOutDescriptor_M_K( out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides); N, C, filter_spatial_lengths, output_spatial_lengths, gemm_g_m_k_strides);
compute_ptr_offset_of_batch_.BatchStrideA_ = image_g_n_c_wis_strides[I0];
compute_ptr_offset_of_batch_.BatchStrideC_ = gemm_g_m_k_strides[I0];
} }
void Print() const void Print() const
...@@ -192,6 +208,7 @@ struct DeviceImageToColumnImpl ...@@ -192,6 +208,7 @@ struct DeviceImageToColumnImpl
std::cout << out_grid_desc_m_k_ << std::endl; std::cout << out_grid_desc_m_k_ << std::endl;
} }
const ck::index_t G_;
const ck::index_t C_; const ck::index_t C_;
const ck::index_t X_; const ck::index_t X_;
...@@ -206,6 +223,8 @@ struct DeviceImageToColumnImpl ...@@ -206,6 +223,8 @@ struct DeviceImageToColumnImpl
InputGridDesc in_grid_desc_m_k_; InputGridDesc in_grid_desc_m_k_;
OutputGridDesc out_grid_desc_m_k_; OutputGridDesc out_grid_desc_m_k_;
ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
}; };
struct Invoker : public BaseInvoker struct Invoker : public BaseInvoker
...@@ -220,12 +239,14 @@ struct DeviceImageToColumnImpl ...@@ -220,12 +239,14 @@ struct DeviceImageToColumnImpl
const auto block_2_tile_map = const auto block_2_tile_map =
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>( BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
arg.out_grid_desc_m_k_); arg.out_grid_desc_m_k_);
const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_); const index_t grid_size =
const auto kernel = kernel_tensor_rearrange<InputGridDesc, block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_) * arg.G_;
const auto kernel = kernel_tensor_rearrange<InputGridDesc,
InputDataType, InputDataType,
OutputGridDesc, OutputGridDesc,
OutputDataType, OutputDataType,
Block2ETileMap, Block2ETileMap,
ComputePtrOffsetOfStridedBatch<I0>,
GridwiseTensorRearrangeKernel>; GridwiseTensorRearrangeKernel>;
float elapsed_time = launch_and_time_kernel(stream_config, float elapsed_time = launch_and_time_kernel(stream_config,
...@@ -237,7 +258,9 @@ struct DeviceImageToColumnImpl ...@@ -237,7 +258,9 @@ struct DeviceImageToColumnImpl
arg.p_in_, arg.p_in_,
arg.out_grid_desc_m_k_, arg.out_grid_desc_m_k_,
arg.p_out_, arg.p_out_,
block_2_tile_map); arg.G_,
block_2_tile_map,
arg.compute_ptr_offset_of_batch_);
return elapsed_time; return elapsed_time;
} }
...@@ -250,9 +273,7 @@ struct DeviceImageToColumnImpl ...@@ -250,9 +273,7 @@ struct DeviceImageToColumnImpl
bool IsSupportedArgument(const Argument& arg) bool IsSupportedArgument(const Argument& arg)
{ {
using namespace tensor_layout::convolution; if constexpr(!(is_NSpatialGC || is_GNSpatialC))
if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
std::is_same_v<ImageLayout, GNDHWC>))
{ {
return false; return false;
} }
...@@ -295,13 +316,14 @@ struct DeviceImageToColumnImpl ...@@ -295,13 +316,14 @@ struct DeviceImageToColumnImpl
static auto MakeArgument(const void* p_in, // input image static auto MakeArgument(const void* p_in, // input image
void* p_out, // gemm form void* p_out, // gemm form
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
...@@ -309,13 +331,14 @@ struct DeviceImageToColumnImpl ...@@ -309,13 +331,14 @@ struct DeviceImageToColumnImpl
{ {
return Argument{static_cast<const InputDataType*>(p_in), return Argument{static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out), static_cast<OutputDataType*>(p_out),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -327,13 +350,14 @@ struct DeviceImageToColumnImpl ...@@ -327,13 +350,14 @@ struct DeviceImageToColumnImpl
std::unique_ptr<BaseArgument> std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in, // input image MakeArgumentPointer(const void* p_in, // input image
void* p_out, // gemm form void* p_out, // gemm form
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
...@@ -341,13 +365,14 @@ struct DeviceImageToColumnImpl ...@@ -341,13 +365,14 @@ struct DeviceImageToColumnImpl
{ {
return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in), return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out), static_cast<OutputDataType*>(p_out),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
......
...@@ -311,6 +311,71 @@ struct AddAddFastGelu ...@@ -311,6 +311,71 @@ struct AddAddFastGelu
} }
}; };
// E = Relu(alpha1 * C + alpha2 * D0 + D1)
struct ScaleAddScaleAddRelu
{
ScaleAddScaleAddRelu(const float alpha1 = 1.f, const float alpha2 = 1.f)
: alpha1_(alpha1), alpha2_(alpha2)
{
}
template <typename E, typename C, typename D0, typename D1>
__host__ __device__ constexpr void
operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
template <>
__host__ __device__ constexpr void operator()<float, float, float, float>(float& e,
const float& c,
const float& d0,
const float& d1) const
{
const float x = c * alpha1_ + alpha2_ * d0 + d1;
Relu{}.template operator()<float>(e, x);
}
template <>
__host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
half_t& e, const half_t& c, const half_t& d0, const half_t& d1) const
{
const float x = type_convert<float>(c) * alpha1_ + alpha2_ * type_convert<float>(d0) +
type_convert<float>(d1);
float result = 0;
Relu{}.template operator()<float>(result, x);
e = type_convert<half_t>(result);
}
template <>
__host__ __device__ constexpr void operator()<bhalf_t, bhalf_t, bhalf_t, bhalf_t>(
bhalf_t& e, const bhalf_t& c, const bhalf_t& d0, const bhalf_t& d1) const
{
const float x = type_convert<float>(c) * alpha1_ + alpha2_ * type_convert<float>(d0) +
type_convert<float>(d1);
float result = 0;
Relu{}.template operator()<float>(result, x);
e = type_convert<bhalf_t>(result);
}
template <>
__host__ __device__ constexpr void operator()<int8_t, int8_t, float, float>(
int8_t& e, const int8_t& c, const float& d0, const float& d1) const
{
const float x = type_convert<float>(c) * alpha1_ + alpha2_ * d0 + d1;
float result = 0;
Relu{}.template operator()<float>(result, x);
e = type_convert<int8_t>(result);
}
const float alpha1_;
const float alpha2_;
};
struct Normalize struct Normalize
{ {
// FIXME: is double absolutely necessary? // FIXME: is double absolutely necessary?
......
...@@ -29,8 +29,8 @@ struct PassThrough ...@@ -29,8 +29,8 @@ struct PassThrough
__host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::f8x2_t& x) const __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::f8x2_t& x) const
{ {
int32_t t = ck::bit_cast<uint16_t>(x); auto t = type_convert<float2_t>(x);
y = ck::bit_cast<ck::half2_t>(t); y = type_convert<half2_t>(t);
} }
__host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::half2_t& x) const __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::half2_t& x) const
......
...@@ -21,6 +21,7 @@ template <typename InputGridDesc, ...@@ -21,6 +21,7 @@ template <typename InputGridDesc,
typename OutputGridDesc, typename OutputGridDesc,
typename OutputDataType, typename OutputDataType,
typename Block2ETileMap, typename Block2ETileMap,
typename ComputePtrOffsetOfStridedBatch,
typename GridwiseTensorRearrangeKernel> typename GridwiseTensorRearrangeKernel>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
...@@ -30,13 +31,20 @@ __global__ void ...@@ -30,13 +31,20 @@ __global__ void
const InputDataType* __restrict__ p_in_global, const InputDataType* __restrict__ p_in_global,
const OutputGridDesc out_grid_desc, const OutputGridDesc out_grid_desc,
OutputDataType* __restrict__ p_out_global, OutputDataType* __restrict__ p_out_global,
const Block2ETileMap block_2_tile_map) const index_t batch_count,
const Block2ETileMap block_2_tile_map,
const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \ #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \ defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__)) defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
GridwiseTensorRearrangeKernel::Run( GridwiseTensorRearrangeKernel::Run(in_grid_desc,
in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map); p_in_global,
out_grid_desc,
p_out_global,
batch_count,
block_2_tile_map,
compute_ptr_offset_of_batch);
#else #else
ignore = in_grid_desc; ignore = in_grid_desc;
ignore = p_in_global; ignore = p_in_global;
...@@ -56,7 +64,8 @@ template <typename InputGridDesc, ...@@ -56,7 +64,8 @@ template <typename InputGridDesc,
typename ThreadClusterLengths, typename ThreadClusterLengths,
index_t ScalarPerVector, index_t ScalarPerVector,
InMemoryDataOperationEnum DstInMemOp, InMemoryDataOperationEnum DstInMemOp,
typename Block2ETileMap> typename Block2ETileMap,
typename ComputePtrOffsetOfStridedBatch>
struct GridwiseTensorRearrange struct GridwiseTensorRearrange
{ {
...@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange ...@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange
const InputDataType* __restrict__ p_in_global, const InputDataType* __restrict__ p_in_global,
const OutputGridDesc& out_grid_desc, const OutputGridDesc& out_grid_desc,
OutputDataType* __restrict__ p_out_global, OutputDataType* __restrict__ p_out_global,
const Block2ETileMap& block_2_tile_map) const index_t batch_count,
const Block2ETileMap& block_2_tile_map,
const ComputePtrOffsetOfStridedBatch& compute_ptr_offset_of_batch)
{ {
const auto block_work_idx = const auto block_work_idx =
block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
...@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange ...@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange
const index_t k_block_data_idx_on_grid = const index_t k_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock); __builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock);
// Global Memory
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global, in_grid_desc.GetElementSpaceSize());
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_out_global, out_grid_desc.GetElementSpaceSize());
auto copy_global_to_global = auto copy_global_to_global =
ThreadGroupTensorSliceTransfer_v7<ThisThreadBlock, ThreadGroupTensorSliceTransfer_v7<ThisThreadBlock,
Tuple<InputDataType>, Tuple<InputDataType>,
...@@ -108,6 +113,22 @@ struct GridwiseTensorRearrange ...@@ -108,6 +113,22 @@ struct GridwiseTensorRearrange
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)), make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
tensor_operation::element_wise::PassThrough{}}; tensor_operation::element_wise::PassThrough{}};
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
// Global Memory
const index_t a_batch_offset =
__builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const index_t c_batch_offset =
__builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize());
copy_global_to_global.Run( copy_global_to_global.Run(
tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf)); tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
} }
......
...@@ -100,6 +100,8 @@ template <> ...@@ -100,6 +100,8 @@ template <>
inline __host__ __device__ f8_t type_convert<f8_t, float>(float x) inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
{ {
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
float max_fp8 = 240.0f;
x = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
union union
{ {
float fval; float fval;
...@@ -138,6 +140,33 @@ inline __host__ __device__ float type_convert<float, f8_t>(f8_t x) ...@@ -138,6 +140,33 @@ inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
#endif #endif
} }
template <>
inline __host__ __device__ float2_t type_convert<float2_t, f8x2_t>(f8x2_t x)
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
f8x2_v.template AsType<f8_t>()[Number<0>{}]
return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, 0);
#else
constexpr bool negative_zero_nan = true;
const auto f8x2_v = vector_type<f8_t, 2>(x);
vector_type<float, 2> f32x2_v;
f32x2_v.template AsType<float>()(Number<0>{}) = utils::cast_from_f8<f8_t, float, negative_zero_nan>(f8x2_v.template AsType<f8_t>()[Number<0>{}]);
f32x2_v.template AsType<float>()(Number<1>{}) = utils::cast_from_f8<f8_t, float, negative_zero_nan>(f8x2_v.template AsType<f8_t>()[Number<1>{}]);
return f32x2_v.template AsType<float2_t>()[Number<0>{}];
#endif
}
template <>
inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
{
const vector_type<float, 2> f32x2_v(x);
const auto y = __builtin_amdgcn_cvt_pkrtz(f32x2_v.template AsType<float>()[Number<0>{}], f32x2_v.template AsType<float>()[Number<1>{}]);
return bit_cast<half2_t>(y);
}
// convert fp16 to fp8 // convert fp16 to fp8
template <> template <>
inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x) inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x)
......
...@@ -19,9 +19,7 @@ namespace host { ...@@ -19,9 +19,7 @@ namespace host {
* \brief Reference implementation for column to image. * \brief Reference implementation for column to image.
* *
* Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout. * Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Memory layout is the same.
* Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. * Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
* *
* \tparam NDimSpatial Number of spatial dimensions. * \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Image Layout. * \tparam ImageLayout Image Layout.
...@@ -95,18 +93,19 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -95,18 +93,19 @@ struct ReferenceColumnToImage : public device::BaseOperator
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 && if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 &&
arg.input_.GetNumOfDimension() == 2)) arg.input_.GetNumOfDimension() == 3))
{ {
throw std::runtime_error("wrong! inconsistent dimension"); throw std::runtime_error("wrong! inconsistent dimension");
} }
const index_t G = arg.output_.GetLengths()[0];
const index_t N = arg.output_.GetLengths()[1]; const index_t N = arg.output_.GetLengths()[1];
const index_t C = arg.output_.GetLengths()[2]; const index_t C = arg.output_.GetLengths()[2];
if constexpr(NDimSpatial == 1) if constexpr(NDimSpatial == 1)
{ {
const index_t Wo = arg.output_spatial_lengths_[0]; const index_t Wo = arg.output_spatial_lengths_[0];
auto func = [&](auto n) { auto func = [&](auto g, auto n) {
for(index_t wo = 0; wo < Wo; ++wo) for(index_t wo = 0; wo < Wo; ++wo)
{ {
index_t row = n * Wo + wo; index_t row = n * Wo + wo;
...@@ -123,9 +122,10 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -123,9 +122,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
if(wi >= 0 && if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.output_.GetLengths()[3]) ck::type_convert<std::size_t>(wi) < arg.output_.GetLengths()[3])
{ {
float v_in = ck::type_convert<float>(arg.input_(row, column)); float v_in =
float v_out = ck::type_convert<float>(arg.output_(0, n, c, wi)); ck::type_convert<float>(arg.input_(g, row, column));
arg.output_(0, n, c, wi) = float v_out = ck::type_convert<float>(arg.output_(g, n, c, wi));
arg.output_(g, n, c, wi) =
ck::type_convert<OutDataType>(v_in + v_out); ck::type_convert<OutDataType>(v_in + v_out);
} }
column++; column++;
...@@ -134,7 +134,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -134,7 +134,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -143,7 +143,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -143,7 +143,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[0]; const index_t Ho = arg.output_spatial_lengths_[0];
const index_t Wo = arg.output_spatial_lengths_[1]; const index_t Wo = arg.output_spatial_lengths_[1];
auto func = [&](auto n) { auto func = [&](auto g, auto n) {
for(index_t ho = 0; ho < Ho; ++ho) for(index_t ho = 0; ho < Ho; ++ho)
{ {
for(index_t wo = 0; wo < Wo; ++wo) for(index_t wo = 0; wo < Wo; ++wo)
...@@ -176,10 +176,10 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -176,10 +176,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
arg.output_.GetLengths()[4]) arg.output_.GetLengths()[4])
{ {
float v_in = float v_in =
ck::type_convert<float>(arg.input_(row, column)); ck::type_convert<float>(arg.input_(g, row, column));
float v_out = ck::type_convert<float>( float v_out = ck::type_convert<float>(
arg.output_(0, n, c, hi, wi)); arg.output_(g, n, c, hi, wi));
arg.output_(0, n, c, hi, wi) = arg.output_(g, n, c, hi, wi) =
ck::type_convert<OutDataType>(v_in + v_out); ck::type_convert<OutDataType>(v_in + v_out);
} }
column++; column++;
...@@ -190,7 +190,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -190,7 +190,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -200,7 +200,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -200,7 +200,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[1]; const index_t Ho = arg.output_spatial_lengths_[1];
const index_t Wo = arg.output_spatial_lengths_[2]; const index_t Wo = arg.output_spatial_lengths_[2];
auto func = [&](auto n) { auto func = [&](auto g, auto n) {
for(index_t d_o = 0; d_o < Do; ++d_o) for(index_t d_o = 0; d_o < Do; ++d_o)
{ {
for(index_t ho = 0; ho < Ho; ++ho) for(index_t ho = 0; ho < Ho; ++ho)
...@@ -245,10 +245,10 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -245,10 +245,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
arg.output_.GetLengths()[5]) arg.output_.GetLengths()[5])
{ {
float v_in = ck::type_convert<float>( float v_in = ck::type_convert<float>(
arg.input_(row, column)); arg.input_(g, row, column));
float v_out = ck::type_convert<float>( float v_out = ck::type_convert<float>(
arg.output_(0, n, c, di, hi, wi)); arg.output_(g, n, c, di, hi, wi));
arg.output_(0, n, c, di, hi, wi) = arg.output_(g, n, c, di, hi, wi) =
ck::type_convert<OutDataType>(v_in + v_out); ck::type_convert<OutDataType>(v_in + v_out);
} }
column++; column++;
...@@ -261,7 +261,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -261,7 +261,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -303,8 +303,9 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -303,8 +303,9 @@ struct ReferenceColumnToImage : public device::BaseOperator
C * ck::accumulate_n<index_t>( C * ck::accumulate_n<index_t>(
arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) && if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(G) &&
arg.input_.GetLengths()[1] == static_cast<std::size_t>(CZYX))) arg.input_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
arg.input_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
{ {
return false; return false;
} }
......
...@@ -42,6 +42,7 @@ template <ck::index_t NDimSpatial, ...@@ -42,6 +42,7 @@ template <ck::index_t NDimSpatial,
typename InElementwiseOperation, typename InElementwiseOperation,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation, typename OutElementwiseOperation,
ck::index_t NumDTensor = 0,
typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false> typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
struct ReferenceConvFwd : public device::BaseOperator struct ReferenceConvFwd : public device::BaseOperator
{ {
...@@ -57,10 +58,12 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -57,10 +58,12 @@ struct ReferenceConvFwd : public device::BaseOperator
std::vector<ck::index_t> input_right_pads, std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op, InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op, WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op) OutElementwiseOperation out_element_op,
const std::array<Tensor<OutDataType>, NumDTensor>& d_tensors)
: input_{input}, : input_{input},
weight_{weight}, weight_{weight},
output_{output}, output_{output},
d_tensors_{d_tensors},
conv_strides_{conv_filter_strides}, conv_strides_{conv_filter_strides},
conv_dilations_{conv_filter_dilations}, conv_dilations_{conv_filter_dilations},
in_left_pads_{input_left_pads}, in_left_pads_{input_left_pads},
...@@ -75,6 +78,8 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -75,6 +78,8 @@ struct ReferenceConvFwd : public device::BaseOperator
const Tensor<WeiDataType>& weight_; const Tensor<WeiDataType>& weight_;
Tensor<OutDataType>& output_; Tensor<OutDataType>& output_;
const std::array<Tensor<OutDataType>, NumDTensor>& d_tensors_;
std::vector<index_t> conv_strides_; std::vector<index_t> conv_strides_;
std::vector<index_t> conv_dilations_; std::vector<index_t> conv_dilations_;
std::vector<index_t> in_left_pads_; std::vector<index_t> in_left_pads_;
...@@ -129,7 +134,26 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -129,7 +134,26 @@ struct ReferenceConvFwd : public device::BaseOperator
} }
OutDataType v_out; OutDataType v_out;
arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc)); OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
if constexpr(NumDTensor == 0)
{
arg.out_element_op_(v_out, v_acc_converted);
}
else if constexpr(NumDTensor == 1)
{
arg.out_element_op_(v_out, v_acc_converted, arg.d_tensors_[0](g, n, k, wo));
}
else if constexpr(NumDTensor == 2)
{
arg.out_element_op_(v_out,
v_acc_converted,
arg.d_tensors_[0](g, n, k, wo),
arg.d_tensors_[1](g, n, k, wo));
}
else
{
throw std::runtime_error("Output ElementOp not supported in reference.");
}
arg.output_(g, n, k, wo) = v_out; arg.output_(g, n, k, wo) = v_out;
}; };
...@@ -183,7 +207,27 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -183,7 +207,27 @@ struct ReferenceConvFwd : public device::BaseOperator
} }
OutDataType v_out; OutDataType v_out;
arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc)); OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
if constexpr(NumDTensor == 0)
{
arg.out_element_op_(v_out, v_acc_converted);
}
else if constexpr(NumDTensor == 1)
{
arg.out_element_op_(
v_out, v_acc_converted, arg.d_tensors_[0](g, n, k, ho, wo));
}
else if constexpr(NumDTensor == 2)
{
arg.out_element_op_(v_out,
v_acc_converted,
arg.d_tensors_[0](g, n, k, ho, wo),
arg.d_tensors_[1](g, n, k, ho, wo));
}
else
{
throw std::runtime_error("Output ElementOp not supported in reference.");
}
arg.output_(g, n, k, ho, wo) = v_out; arg.output_(g, n, k, ho, wo) = v_out;
}; };
...@@ -250,7 +294,27 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -250,7 +294,27 @@ struct ReferenceConvFwd : public device::BaseOperator
} }
OutDataType v_out; OutDataType v_out;
arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc)); OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
if constexpr(NumDTensor == 0)
{
arg.out_element_op_(v_out, v_acc_converted);
}
else if constexpr(NumDTensor == 1)
{
arg.out_element_op_(
v_out, v_acc_converted, arg.d_tensors_[0](g, n, k, d_o, ho, wo));
}
else if constexpr(NumDTensor == 2)
{
arg.out_element_op_(v_out,
v_acc_converted,
arg.d_tensors_[0](g, n, k, d_o, ho, wo),
arg.d_tensors_[1](g, n, k, d_o, ho, wo));
}
else
{
throw std::runtime_error("Output ElementOp not supported in reference.");
}
arg.output_(g, n, k, d_o, ho, wo) = v_out; arg.output_(g, n, k, d_o, ho, wo) = v_out;
}; };
...@@ -294,7 +358,8 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -294,7 +358,8 @@ struct ReferenceConvFwd : public device::BaseOperator
std::vector<ck::index_t> input_right_pads, std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op, InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op, WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op) OutElementwiseOperation out_element_op,
const std::array<Tensor<OutDataType>, NumDTensor>& d_tensors = {})
{ {
return Argument{input, return Argument{input,
weight, weight,
...@@ -305,7 +370,8 @@ struct ReferenceConvFwd : public device::BaseOperator ...@@ -305,7 +370,8 @@ struct ReferenceConvFwd : public device::BaseOperator
input_right_pads, input_right_pads,
in_element_op, in_element_op,
wei_element_op, wei_element_op,
out_element_op}; out_element_op,
d_tensors};
} }
static auto MakeInvoker() { return Invoker{}; } static auto MakeInvoker() { return Invoker{}; }
......
...@@ -19,9 +19,7 @@ namespace host { ...@@ -19,9 +19,7 @@ namespace host {
* \brief Reference implementation for image to column. * \brief Reference implementation for image to column.
* *
* Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. * Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C]. * Output tensor descriptor has [G * N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Memory layout is the same.
* *
* \tparam NDimSpatial Number of spatial dimensions. * \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Image Layout. * \tparam ImageLayout Image Layout.
...@@ -95,18 +93,19 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -95,18 +93,19 @@ struct ReferenceImageToColumn : public device::BaseOperator
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 && if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 &&
arg.output_.GetNumOfDimension() == 2)) arg.output_.GetNumOfDimension() == 3))
{ {
throw std::runtime_error("wrong! inconsistent dimension"); throw std::runtime_error("wrong! inconsistent dimension");
} }
const index_t G = arg.input_.GetLengths()[0];
const index_t N = arg.input_.GetLengths()[1]; const index_t N = arg.input_.GetLengths()[1];
const index_t C = arg.input_.GetLengths()[2]; const index_t C = arg.input_.GetLengths()[2];
if constexpr(NDimSpatial == 1) if constexpr(NDimSpatial == 1)
{ {
const index_t Wo = arg.output_spatial_lengths_[0]; const index_t Wo = arg.output_spatial_lengths_[0];
auto func = [&](auto n, auto wo) { auto func = [&](auto g, auto n, auto wo) {
index_t row = n * Wo + wo; index_t row = n * Wo + wo;
index_t column = 0; index_t column = 0;
...@@ -121,15 +120,15 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -121,15 +120,15 @@ struct ReferenceImageToColumn : public device::BaseOperator
if(wi >= 0 && if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3]) ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
{ {
InDataType v_in = arg.input_(0, n, c, wi); InDataType v_in = arg.input_(g, n, c, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in); arg.output_(g, row, column) = ck::type_convert<OutDataType>(v_in);
} }
column++; column++;
} }
} }
}; };
make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -138,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -138,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[0]; const index_t Ho = arg.output_spatial_lengths_[0];
const index_t Wo = arg.output_spatial_lengths_[1]; const index_t Wo = arg.output_spatial_lengths_[1];
auto func = [&](auto n, auto ho, auto wo) { auto func = [&](auto g, auto n, auto ho, auto wo) {
index_t row = n * Ho * Wo + ho * Wo + wo; index_t row = n * Ho * Wo + ho * Wo + wo;
index_t column = 0; index_t column = 0;
...@@ -162,8 +161,9 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -162,8 +161,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
wi >= 0 && wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4]) ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
{ {
InDataType v_in = arg.input_(0, n, c, hi, wi); InDataType v_in = arg.input_(g, n, c, hi, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in); arg.output_(g, row, column) =
ck::type_convert<OutDataType>(v_in);
} }
column++; column++;
} }
...@@ -171,7 +171,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -171,7 +171,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -181,7 +181,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -181,7 +181,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[1]; const index_t Ho = arg.output_spatial_lengths_[1];
const index_t Wo = arg.output_spatial_lengths_[2]; const index_t Wo = arg.output_spatial_lengths_[2];
auto func = [&](auto n, auto d_o, auto ho, auto wo) { auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo; index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
index_t column = 0; index_t column = 0;
...@@ -213,8 +213,8 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -213,8 +213,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
ck::type_convert<std::size_t>(wi) < ck::type_convert<std::size_t>(wi) <
arg.input_.GetLengths()[5]) arg.input_.GetLengths()[5])
{ {
InDataType v_in = arg.input_(0, n, c, di, hi, wi); InDataType v_in = arg.input_(g, n, c, di, hi, wi);
arg.output_(row, column) = arg.output_(g, row, column) =
ck::type_convert<OutDataType>(v_in); ck::type_convert<OutDataType>(v_in);
} }
column++; column++;
...@@ -224,7 +224,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -224,7 +224,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N, Do, Ho, Wo)( make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
return 0; return 0;
...@@ -267,8 +267,9 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -267,8 +267,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
C * ck::accumulate_n<index_t>( C * ck::accumulate_n<index_t>(
arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) && if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(G) &&
arg.output_.GetLengths()[1] == static_cast<std::size_t>(CZYX))) arg.output_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
arg.output_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
{ {
return false; return false;
} }
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
#ifdef CK_ENABLE_BF16
// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
NDHWGC,
GKZYXC,
ck::Tuple<NDHWGK, NDHWGK>,
NDHWGK,
BF16,
BF16,
ck::Tuple<BF16, BF16>,
BF16,
PassThrough,
PassThrough,
ScaleAddScaleAddRelu>>>& instances);
#endif
#ifdef CK_ENABLE_FP16
void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
NDHWGC,
GKZYXC,
ck::Tuple<NDHWGK, NDHWGK>,
NDHWGK,
F16,
F16,
ck::Tuple<F16, F16>,
F16,
PassThrough,
PassThrough,
ScaleAddScaleAddRelu>>>& instances);
#endif
#ifdef CK_ENABLE_FP32
void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
NDHWGC,
GKZYXC,
ck::Tuple<NDHWGK, NDHWGK>,
NDHWGK,
F32,
F32,
ck::Tuple<F32, F32>,
F32,
PassThrough,
PassThrough,
ScaleAddScaleAddRelu>>>& instances);
#endif
#ifdef CK_ENABLE_INT8
void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
NDHWGC,
GKZYXC,
ck::Tuple<NDHWGK, NDHWGK>,
NDHWGK,
int8_t,
int8_t,
ck::Tuple<F32, F32>,
int8_t,
PassThrough,
PassThrough,
ScaleAddScaleAddRelu>>>& instances);
#endif
template <ck::index_t NumDimSpatial,
typename InLayout,
typename WeiLayout,
typename DLayouts,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename DDataTypes,
typename OutDataType,
typename ComputeType>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
NumDimSpatial,
InLayout,
WeiLayout,
DLayouts,
OutLayout,
InDataType,
WeiDataType,
DDataTypes,
OutDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::ScaleAddScaleAddRelu,
ComputeType>>
{
using DeviceOp =
DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout,
WeiLayout,
DLayouts,
OutLayout,
InDataType,
WeiDataType,
DDataTypes,
OutDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::ScaleAddScaleAddRelu,
ComputeType>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
{
#ifdef CK_ENABLE_FP32
if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
is_same_v<OutDataType, float>)
{
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances(
op_ptrs);
}
#endif
#ifdef CK_ENABLE_FP16
if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
{
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances(
op_ptrs);
}
#endif
#ifdef CK_ENABLE_BF16
if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
{
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
op_ptrs);
}
#endif
#ifdef CK_ENABLE_INT8
if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
is_same_v<OutDataType, int8_t>)
{
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances(
op_ptrs);
}
#endif
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
add_instance_library(device_column_to_image_instance add_instance_library(device_column_to_image_instance
device_column_to_image_nhwc_1d_instance.cpp device_column_to_image_gnwc_1d_instance.cpp
device_column_to_image_nhwc_2d_instance.cpp device_column_to_image_gnhwc_2d_instance.cpp
device_column_to_image_nhwc_3d_instance.cpp device_column_to_image_gndhwc_3d_instance.cpp
device_column_to_image_nwgc_1d_instance.cpp
device_column_to_image_nhwgc_2d_instance.cpp
device_column_to_image_ndhwgc_3d_instance.cpp
) )
...@@ -11,7 +11,7 @@ namespace instance { ...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op; using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_ndhwc_3d_bf16_instances( void add_device_column_to_image_gndhwc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances( ...@@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances(
#endif #endif
} }
void add_device_column_to_image_ndhwc_3d_f16_instances( void add_device_column_to_image_gndhwc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances( ...@@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances(
#endif #endif
} }
void add_device_column_to_image_ndhwc_3d_f32_instances( void add_device_column_to_image_gndhwc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
instances) instances)
{ {
...@@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances( ...@@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances(
#endif #endif
} }
void add_device_column_to_image_ndhwc_3d_i8_instances( void add_device_column_to_image_gndhwc_3d_i8_instances(
std::vector< std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>& std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
instances) instances)
......
...@@ -11,7 +11,7 @@ namespace instance { ...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op; using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_nhwc_2d_bf16_instances( void add_device_column_to_image_gnhwc_2d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances( ...@@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances(
#endif #endif
} }
void add_device_column_to_image_nhwc_2d_f16_instances( void add_device_column_to_image_gnhwc_2d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances( ...@@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances(
#endif #endif
} }
void add_device_column_to_image_nhwc_2d_f32_instances( void add_device_column_to_image_gnhwc_2d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
instances) instances)
{ {
...@@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances( ...@@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances(
#endif #endif
} }
void add_device_column_to_image_nhwc_2d_i8_instances( void add_device_column_to_image_gnhwc_2d_i8_instances(
std::vector< std::vector<
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>& std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
instances) instances)
......
...@@ -11,7 +11,7 @@ namespace instance { ...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op; using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_nwc_1d_bf16_instances( void add_device_column_to_image_gnwc_1d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances( ...@@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances(
#endif #endif
} }
void add_device_column_to_image_nwc_1d_f16_instances( void add_device_column_to_image_gnwc_1d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances( ...@@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances(
#endif #endif
} }
void add_device_column_to_image_nwc_1d_f32_instances( void add_device_column_to_image_gnwc_1d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
instances) instances)
{ {
...@@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances( ...@@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances(
#endif #endif
} }
void add_device_column_to_image_nwc_1d_i8_instances( void add_device_column_to_image_gnwc_1d_i8_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
instances) instances)
{ {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_ndhwgc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances(instances, device_column_to_image_bf16_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_ndhwgc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_column_to_image_f16_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_ndhwgc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_column_to_image_f32_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_ndhwgc_3d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances(instances, device_column_to_image_i8_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment