"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "557d7aa343261874b934980da964eebea546cad1"
Commit 561e787e authored by Bartlomiej Kocot's avatar Bartlomiej Kocot
Browse files

Add instances, tests, profiler, example

parent 9f008852
...@@ -10,11 +10,11 @@ using InLayout = ck::tensor_layout::convolution::GNHWC; ...@@ -10,11 +10,11 @@ using InLayout = ck::tensor_layout::convolution::GNHWC;
// clang-format off // clang-format off
using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
// ######| NDimSpatial| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
// ######| | | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| //#####################| Dim| | | | Size| Block| Block| Cluster| Per|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| //#####################| Spatial| | | | | | | Lengths| Vector|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //#####################| | | | | | | | | |
< NDimSpatial, InLayout, InDataType, OutDataType, 256, 128, 128, S<128, 128>, S<16, 16>, 4>; < NDimSpatial, InLayout, InDataType, OutDataType, 256, 128, 128, S<16, 16>, 1>;
// clang-format on // clang-format on
bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params) bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/host_utility/io.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename InputGridDesc,
typename InputDataType,
typename OutputGridDesc,
typename OutputDataType,
typename Block2ETileMap,
typename GridwiseImageToColumnKernel>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_image_to_column(const InputGridDesc in_grid_desc,
const InputDataType* __restrict__ p_in_global,
const OutputGridDesc out_grid_desc,
OutputDataType* __restrict__ p_out_global,
const Block2ETileMap block_2_tile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
GridwiseImageToColumnKernel::Run(
in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map);
#else
ignore = in_grid_desc;
ignore = p_in_global;
ignore = out_grid_desc;
ignore = p_out_global;
ignore = block_2_tile_map;
#endif
}
// Image to column for input layout NDHWC:
// input : input image [N, Di, Hi, Wi, C],
// output : output image [N * Do * Ho * Wo, Z * Y * X * C]
template <index_t NDimSpatial,
typename InputLayout,
typename InputDataType,
typename OutputDataType,
index_t BlockSize,
index_t MPerBlock,
index_t KPerBlock,
typename ThreadClusterLengths,
index_t ScalarPerVector>
struct DeviceImageToColumnImpl
: public DeviceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto conv_to_gemm_transformer =
TransformConvFwdToGemm<NDimSpatial, ConvolutionForwardSpecialization::Default>{};
static constexpr auto matrix_padder =
MatrixPadder<GemmSpecialization::MKPadding, index_t, index_t, index_t>{
MPerBlock, 0 /* NPerBlock*/, KPerBlock};
// Use MakeADescriptor_M_K from grouped convolution forward
static auto
MakeInputDescriptor_M_K(const ck::index_t N,
const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads)
{
std::array<index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{1};
std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{1};
std::array<index_t, NDimSpatial + 3> c_g_n_k_wos_lengths{1};
auto copy = [](const auto& x, auto& y, index_t dst_offset) {
std::copy(x.begin(), x.end(), y.begin() + dst_offset);
};
constexpr index_t spatial_offset = 3;
copy(input_spatial_lengths, a_g_n_c_wis_lengths, spatial_offset);
copy(filter_spatial_lengths, b_g_k_c_xs_lengths, spatial_offset);
copy(output_spatial_lengths, c_g_n_k_wos_lengths, spatial_offset);
// fill only significant values (C and N)
a_g_n_c_wis_lengths[I1] = N;
a_g_n_c_wis_lengths[I2] = C;
b_g_k_c_xs_lengths[I2] = C;
c_g_n_k_wos_lengths[I1] = N;
const auto in_gemmmraw_gemmkraw_desc =
conv_to_gemm_transformer.template MakeADescriptor_M_K<InputLayout>(
a_g_n_c_wis_lengths,
input_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
{}, // not needed for A Descriptor
c_g_n_k_wos_lengths,
{}, // not needed for A Descriptor
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
const auto in_gemmm_gemmk_desc =
matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc);
return in_gemmm_gemmk_desc;
}
static auto
MakeOutDescriptor_M_K(const ck::index_t N,
const ck::index_t C,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, 2>& output_m_k_strides)
{
const index_t NDoHoWo =
N * ck::accumulate_n<index_t>(
output_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
const index_t CZYX =
C * ck::accumulate_n<index_t>(
filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
const auto desc_mraw_kraw = make_naive_tensor_descriptor(
make_tuple(NDoHoWo, CZYX), make_tuple(output_m_k_strides[I0], output_m_k_strides[I1]));
const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
return desc_m_k;
}
using InputGridDesc =
remove_cvref_t<decltype(MakeInputDescriptor_M_K(1, 1, {}, {}, {}, {}, {}, {}, {}, {}))>;
using OutputGridDesc = remove_cvref_t<decltype(MakeOutDescriptor_M_K(1, 1, {}, {}, {}))>;
using Block2ETileMap = remove_cvref_t<decltype(
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(OutputGridDesc{}))>;
using GridwiseImageToColumnKernel = GridwiseImageToColumn<InputGridDesc,
InputDataType,
OutputGridDesc,
OutputDataType,
BlockSize,
MPerBlock,
KPerBlock,
ThreadClusterLengths,
ScalarPerVector,
Block2ETileMap>;
struct Argument : public BaseArgument
{
Argument(const void* p_in, // input image
void* p_out, // output image
const ck::index_t N,
const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
const std::array<index_t, 2>& output_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads)
: C_(C),
X_(filter_spatial_lengths[NDimSpatial - I1]),
p_in_{static_cast<const InputDataType*>(p_in)},
p_out_{static_cast<OutputDataType*>(p_out)},
input_g_n_c_wis_strides_{input_g_n_c_wis_strides},
conv_filter_strides_{conv_filter_strides},
conv_filter_dilations_{conv_filter_dilations},
input_left_pads_{input_left_pads},
input_right_pads_{input_right_pads}
{
in_grid_desc_m_k_ = MakeInputDescriptor_M_K(N,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_g_n_c_wis_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
N, C, filter_spatial_lengths, output_spatial_lengths, output_m_k_strides);
}
void Print() const
{
std::cout << in_grid_desc_m_k_ << std::endl;
std::cout << out_grid_desc_m_k_ << std::endl;
}
const ck::index_t C_;
const ck::index_t X_;
const InputDataType* p_in_;
OutputDataType* p_out_;
const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides_;
const std::array<index_t, NDimSpatial>& conv_filter_strides_;
const std::array<index_t, NDimSpatial>& conv_filter_dilations_;
const std::array<index_t, NDimSpatial>& input_left_pads_;
const std::array<index_t, NDimSpatial>& input_right_pads_;
InputGridDesc in_grid_desc_m_k_;
OutputGridDesc out_grid_desc_m_k_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(stream_config.log_level_ > 0)
{
arg.Print();
}
const auto block_2_tile_map =
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
arg.out_grid_desc_m_k_, I1 /*M01*/);
const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_);
const auto kernel = kernel_image_to_column<InputGridDesc,
InputDataType,
OutputGridDesc,
OutputDataType,
Block2ETileMap,
GridwiseImageToColumnKernel>;
float elapsed_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.in_grid_desc_m_k_,
arg.p_in_,
arg.out_grid_desc_m_k_,
arg.p_out_,
block_2_tile_map);
return elapsed_time;
}
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
bool IsSupportedArgument(const Argument& arg)
{
using namespace tensor_layout::convolution;
if(!(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
std::is_same_v<InputLayout, GNDHWC>))
{
return false;
}
if(!(NDimSpatial >= 1 && NDimSpatial <= 3))
{
return false;
}
const auto x_pad_left = arg.input_left_pads_[NDimSpatial - I1];
const auto x_pad_right = arg.input_right_pads_[NDimSpatial - I1];
const auto dilation_x = arg.conv_filter_dilations_[NDimSpatial - I1];
const auto stride_x = arg.conv_filter_strides_[NDimSpatial - I1];
bool is_c_packed = arg.input_g_n_c_wis_strides_[NDimSpatial + I2] == arg.C_;
// check vector access of filter window row (only C if C is not packed)
if(!is_c_packed && arg.C_ % ScalarPerVector != 0)
return false;
// check vector access of filter window row (X * C)
if(arg.X_ * arg.C_ % ScalarPerVector != 0)
return false;
// check vector access of pads (x_pad_left/x_pad_right * C)
if(x_pad_left * arg.C_ % ScalarPerVector != 0 ||
x_pad_right * arg.C_ % ScalarPerVector != 0)
return false;
// check vector access of with stride and pad
if((x_pad_left != 0 || x_pad_right != 0) && stride_x > 1 && arg.C_ % ScalarPerVector != 0)
return false;
// check vector access of with dilation
if(dilation_x > 1 && arg.C_ % ScalarPerVector != 0)
return false;
return GridwiseImageToColumnKernel::CheckValidity(arg.in_grid_desc_m_k_,
arg.out_grid_desc_m_k_);
}
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const void* p_in, // input image
void* p_out, // output image
const ck::index_t N,
const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
const std::array<index_t, 2>& output_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads)
{
return Argument{static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out),
N,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_g_n_c_wis_strides,
output_m_k_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
}
static auto MakeInvoker() { return Invoker{}; }
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in, // input image
void* p_out, // output image
const ck::index_t N,
const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
const std::array<index_t, 2>& output_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads) override
{
return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out),
N,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_g_n_c_wis_strides,
output_m_k_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceImageToColumn"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< KPerBlock << ", "
<< ScalarPerVector
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
...@@ -24,7 +24,6 @@ template <typename InputGridDesc, ...@@ -24,7 +24,6 @@ template <typename InputGridDesc,
index_t BlockSize, index_t BlockSize,
index_t MPerBlock, index_t MPerBlock,
index_t KPerBlock, index_t KPerBlock,
typename SliceLengths,
typename ThreadClusterLengths, typename ThreadClusterLengths,
index_t ScalarPerVector, index_t ScalarPerVector,
typename Block2ETileMap> typename Block2ETileMap>
...@@ -65,7 +64,7 @@ struct GridwiseImageToColumn ...@@ -65,7 +64,7 @@ struct GridwiseImageToColumn
decltype(tie(out_grid_desc)), decltype(tie(out_grid_desc)),
tensor_operation::element_wise::PassThrough, tensor_operation::element_wise::PassThrough,
Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>, Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
SliceLengths, Sequence<MPerBlock, KPerBlock>,
ThreadClusterLengths, ThreadClusterLengths,
Sequence<0, 1>, Sequence<0, 1>,
Sequence<0, 1>, Sequence<0, 1>,
...@@ -83,7 +82,6 @@ struct GridwiseImageToColumn ...@@ -83,7 +82,6 @@ struct GridwiseImageToColumn
tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf)); tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
} }
// template <typename... TsIn, typename... TsOut>
__host__ static constexpr bool CheckValidity(const InputGridDesc& in_grid_desc, __host__ static constexpr bool CheckValidity(const InputGridDesc& in_grid_desc,
const OutputGridDesc& out_grid_desc) const OutputGridDesc& out_grid_desc)
{ {
......
...@@ -116,15 +116,15 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -116,15 +116,15 @@ struct ReferenceImageToColumn : public device::BaseOperator
static_cast<ck::long_index_t>(x * arg.conv_dilations_[0]) - static_cast<ck::long_index_t>(x * arg.conv_dilations_[0]) -
static_cast<ck::long_index_t>(arg.in_left_pads_[0]); static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
for(index_t c = 0; c < C; ++c)
{
if(wi >= 0 && if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3]) ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
{ {
for(index_t c = 0; c < C; ++c)
{
column++;
InDataType v_in = arg.input_(0, n, c, wi); InDataType v_in = arg.input_(0, n, c, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in); arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
} }
column++;
} }
} }
}; };
...@@ -154,18 +154,18 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -154,18 +154,18 @@ struct ReferenceImageToColumn : public device::BaseOperator
static_cast<ck::long_index_t>(x * arg.conv_dilations_[1]) - static_cast<ck::long_index_t>(x * arg.conv_dilations_[1]) -
static_cast<ck::long_index_t>(arg.in_left_pads_[1]); static_cast<ck::long_index_t>(arg.in_left_pads_[1]);
for(index_t c = 0; c < C; ++c)
{
if(hi >= 0 && if(hi >= 0 &&
ck::type_convert<std::size_t>(hi) < arg.input_.GetLengths()[3] && ck::type_convert<std::size_t>(hi) < arg.input_.GetLengths()[3] &&
wi >= 0 && wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4]) ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
{ {
for(index_t c = 0; c < C; ++c)
{
InDataType v_in = arg.input_(0, n, c, hi, wi); InDataType v_in = arg.input_(0, n, c, hi, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in); arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
column++;
} }
column++;
} }
} }
} }
...@@ -201,20 +201,23 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -201,20 +201,23 @@ struct ReferenceImageToColumn : public device::BaseOperator
static_cast<ck::long_index_t>(wo * arg.conv_strides_[2]) + static_cast<ck::long_index_t>(wo * arg.conv_strides_[2]) +
static_cast<ck::long_index_t>(x * arg.conv_dilations_[2]) - static_cast<ck::long_index_t>(x * arg.conv_dilations_[2]) -
static_cast<ck::long_index_t>(arg.in_left_pads_[2]); static_cast<ck::long_index_t>(arg.in_left_pads_[2]);
for(index_t c = 0; c < C; ++c)
{
if(di >= 0 && if(di >= 0 &&
ck::type_convert<std::size_t>(di) < arg.input_.GetLengths()[3] && ck::type_convert<std::size_t>(di) <
arg.input_.GetLengths()[3] &&
hi >= 0 && hi >= 0 &&
ck::type_convert<std::size_t>(hi) < arg.input_.GetLengths()[4] && ck::type_convert<std::size_t>(hi) <
arg.input_.GetLengths()[4] &&
wi >= 0 && wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[5]) ck::type_convert<std::size_t>(wi) <
{ arg.input_.GetLengths()[5])
for(index_t c = 0; c < C; ++c)
{ {
InDataType v_in = arg.input_(0, n, c, di, hi, wi); InDataType v_in = arg.input_(0, n, c, di, hi, wi);
arg.output_(row, column) = arg.output_(row, column) =
ck::type_convert<OutDataType>(v_in); ck::type_convert<OutDataType>(v_in);
column++;
} }
column++;
} }
} }
} }
......
...@@ -24,44 +24,94 @@ using S = ck::Sequence<Is...>; ...@@ -24,44 +24,94 @@ using S = ck::Sequence<Is...>;
template <ck::index_t NDimSpatial, typename InLayout> template <ck::index_t NDimSpatial, typename InLayout>
using device_image_to_column_bf16_instances = std::tuple< using device_image_to_column_bf16_instances = std::tuple<
// clang-format off // clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Slice| Thread| Scalar| //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Lengths| Cluster| Per| //#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | | Lengths| Vector| //#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | | | //#####################| | | | | | | | | |
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 256, 128, 128, S<128, 128>, S<16, 16>, 8> DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 64, 8, 8, S<8, 8>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 64, 16, 16, S<8, 8>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 64, 32, 32, S<8, 8>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 64, 64, 64, S<8, 8>, 8>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 128, 16, 16, S<8, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 128, 64, 64, S<8, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 128, 32, 64, S<8, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 128, 64, 128, S<8, 16>, 8>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 256, 16, 16, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 256, 64, 64, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 256, 128, 128, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 256, 64, 64, S<16, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 256, 128, 128, S<16, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, BF16, BF16, 256, 128, 128, S<16, 16>, 8>
// clang-format on // clang-format on
>; >;
template <ck::index_t NDimSpatial, typename InLayout> template <ck::index_t NDimSpatial, typename InLayout>
using device_image_to_column_f16_instances = std::tuple< using device_image_to_column_f16_instances = std::tuple<
// clang-format off // clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Slice| Thread| Scalar| //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Lengths| Cluster| Per| //#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | | Lengths| Vector| //#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | | | //#####################| | | | | | | | | |
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 256, 128, 128, S<128, 128>, S<16, 16>, 8> DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 64, 8, 8, S<8, 8>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 64, 16, 16, S<8, 8>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 64, 32, 32, S<8, 8>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 64, 64, 64, S<8, 8>, 8>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 128, 16, 16, S<8, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 128, 64, 64, S<8, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 128, 32, 64, S<8, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 128, 64, 128, S<8, 16>, 8>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 256, 16, 16, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 256, 64, 64, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 256, 128, 128, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 256, 64, 64, S<16, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 256, 128, 128, S<16, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F16, F16, 256, 128, 128, S<16, 16>, 8>
// clang-format on // clang-format on
>; >;
template <ck::index_t NDimSpatial, typename InLayout> template <ck::index_t NDimSpatial, typename InLayout>
using device_image_to_column_f32_instances = std::tuple< using device_image_to_column_f32_instances = std::tuple<
// clang-format off // clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Slice| Thread| Scalar| //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Lengths| Cluster| Per| //#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | | Lengths| Vector| //#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | | | //#####################| | | | | | | | | |
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 256, 128, 128, S<128, 128>, S<16, 16>, 4> DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 64, 8, 8, S<8, 8>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 64, 16, 16, S<8, 8>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 64, 32, 32, S<8, 8>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 128, 16, 16, S<8, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 128, 64, 64, S<8, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 128, 32, 64, S<8, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 256, 16, 16, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 256, 64, 64, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 256, 128, 128, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 256, 64, 64, S<16, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, F32, F32, 256, 128, 128, S<16, 16>, 4>
// clang-format on // clang-format on
>; >;
template <ck::index_t NDimSpatial, typename InLayout> template <ck::index_t NDimSpatial, typename InLayout>
using device_image_to_column_i8_instances = std::tuple< using device_image_to_column_i8_instances = std::tuple<
// clang-format off // clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Slice| Thread| Scalar| //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Lengths| Cluster| Per| //#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | | Lengths| Vector| //#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | | | //#####################| | | | | | | | | |
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 256, 256, 256, S<256, 256>, S<16, 16>, 16> DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 64, 8, 8, S<8, 8>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 64, 16, 16, S<8, 8>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 64, 32, 32, S<8, 8>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 64, 64, 64, S<8, 8>, 8>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 128, 16, 16, S<8, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 128, 64, 64, S<8, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 128, 32, 64, S<8, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 128, 64, 128, S<8, 16>, 8>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 256, 16, 16, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 256, 64, 64, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 256, 128, 128, S<16, 16>, 1>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 256, 64, 64, S<16, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 256, 128, 128, S<16, 16>, 4>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 256, 128, 128, S<16, 16>, 8>,
DeviceImageToColumnImpl<NDimSpatial, InLayout, int8_t, int8_t, 256, 256, 256, S<16, 16>, 16>
// clang-format on // clang-format on
>; >;
......
add_instance_library(device_image_to_column_instance
device_image_to_column_nhwc_1d_instance.cpp
device_image_to_column_nhwc_2d_instance.cpp
device_image_to_column_nhwc_3d_instance.cpp
)
...@@ -184,3 +184,42 @@ tflops: 95.337 ...@@ -184,3 +184,42 @@ tflops: 95.337
GB/s: 69.2301 GB/s: 69.2301
``` ```
Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time. Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
## Profile image to column kernels
```bash
# arg1: tensor operation (" OP_NAME ": " OP_DESC ")
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
# 1: Input fp16, Weight fp16, Output fp16
# 2: Input bf16, Weight bf16, Output bf16
# 3: Input int8, Weight int8, Output int8)
# arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])
# arg4: verification (0: no, 1: yes)
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
# <strides>, (ie Sy, Sx for 2D)
# <dilations>, (ie Dy, Dx for 2D)
# <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D)
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler image_to_column 0 0 1 1 0 1 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
```
Result (MI250, FP32, NHWC)
```
input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512}
output: dim 2, lengths {173056, 4608}, strides {4608, 1}
....
Best configuration parameters:
name: DeviceImageToColumn<256, 64, 64, 4>
avg_time: 3.19792
tflops: 0
GB/s: 1125.99
```
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include <limits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
namespace ck {
namespace profiler {
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
template <index_t NDimSpatial,
typename InputLayout,
typename InputDataType,
typename OutputDataType>
bool profile_image_to_column_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param)
{
const ck::index_t NDoHoWo =
conv_param.N_ *
ck::accumulate_n<ck::index_t>(
conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const ck::index_t CZYX =
conv_param.C_ *
ck::accumulate_n<ck::index_t>(
conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const auto in_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InputLayout>(
conv_param);
const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
std::array<ck::index_t, 2> output_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
copy(out_desc.GetStrides(), output_m_k_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
Tensor<InputDataType> input(in_desc);
Tensor<OutputDataType> host_output(out_desc);
Tensor<OutputDataType> device_output(out_desc);
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1: input.GenerateTensorValue(GeneratorTensor_2<InputDataType>{-5, 5}); break;
default: input.GenerateTensorValue(GeneratorTensor_3<InputDataType>{0.0, 1.0});
}
DeviceMem in_device_buf(sizeof(InputDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutputDataType) * device_output.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(input.mData.data());
// run reference op
if(do_verification)
{
auto ref_image_to_column = ck::tensor_operation::host::
ReferenceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>{};
auto ref_invoker = ref_image_to_column.MakeInvoker();
auto ref_argument = ref_image_to_column.MakeArgument(input,
host_output,
conv_param.filter_spatial_lengths_,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_);
// init host output to zero
host_output.SetZero();
ref_invoker.Run(ref_argument);
}
using DeviceOp = ck::tensor_operation::device::
DeviceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
float best_avg_time = std::numeric_limits<float>::max();
;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device op instances
bool pass = true;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<InputDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutputDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param.N_,
conv_param.C_,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_g_n_c_wis_strides,
output_m_k_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init output to zero before profiling next kernel
out_device_buf.SetZero();
std::string op_name = op_ptr->GetTypeString();
auto invoker_ptr = op_ptr->MakeInvokerPointer();
float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_btype =
conv_param.GetInputByte<InputDataType>() + NDoHoWo * CZYX * sizeof(OutputDataType);
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(avg_time < best_avg_time)
{
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
out_device_buf.FromDevice(device_output.mData.data());
pass = pass & ck::utils::check_err(device_output, host_output);
if(do_log)
{
LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_output : ", host_output.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
<< std::endl;
}
}
}
else
{
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
}
}
std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
...@@ -25,6 +25,7 @@ set(PROFILER_SOURCES ...@@ -25,6 +25,7 @@ set(PROFILER_SOURCES
profile_contraction_bilinear.cpp profile_contraction_bilinear.cpp
profile_contraction_scale.cpp profile_contraction_scale.cpp
profile_grouped_conv_bwd_data.cpp profile_grouped_conv_bwd_data.cpp
profile_image_to_column.cpp
) )
if(DL_KERNELS) if(DL_KERNELS)
list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
...@@ -76,6 +77,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_in ...@@ -76,6 +77,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_in
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
if(DL_KERNELS) if(DL_KERNELS)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
endif() endif()
......
...@@ -60,6 +60,7 @@ add_subdirectory(contraction) ...@@ -60,6 +60,7 @@ add_subdirectory(contraction)
add_subdirectory(pool_fwd) add_subdirectory(pool_fwd)
add_subdirectory(batched_gemm_multi_d) add_subdirectory(batched_gemm_multi_d)
add_subdirectory(grouped_convnd_bwd_data) add_subdirectory(grouped_convnd_bwd_data)
add_subdirectory(image_to_column)
if(GPU_TARGETS MATCHES "gfx11") if(GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op) add_subdirectory(wmma_op)
endif() endif()
add_gtest_executable(test_image_to_column test_image_to_column.cpp)
target_link_libraries(test_image_to_column PRIVATE utility device_image_to_column_instance)
add_gtest_executable(test_image_to_column_interface test_image_to_column_interface.cpp)
target_link_libraries(test_image_to_column_interface PRIVATE utility)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_image_to_column_impl.hpp"
template <typename Tuple>
class TestImageToColumn : public ::testing::Test
{
protected:
using InDataType = std::tuple_element_t<0, Tuple>;
using OutDataType = std::tuple_element_t<1, Tuple>;
using InLayout = std::tuple_element_t<2, Tuple>;
std::vector<ck::utils::conv::ConvParam> conv_params;
template <ck::index_t NDimSpatial>
void Run()
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
{
pass = pass && ck::profiler::profile_image_to_column_impl<NDimSpatial,
InLayout,
InDataType,
OutDataType>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
}
EXPECT_TRUE(pass);
}
};
using namespace ck::tensor_layout::convolution;
using KernelTypes1d = ::testing::Types<std::tuple<float, float, GNWC>,
std::tuple<ck::bhalf_t, ck::bhalf_t, GNWC>,
std::tuple<ck::half_t, ck::half_t, GNWC>,
std::tuple<int8_t, int8_t, GNWC>>;
using KernelTypes2d = ::testing::Types<std::tuple<float, float, GNHWC>,
std::tuple<ck::bhalf_t, ck::bhalf_t, GNHWC>,
std::tuple<ck::half_t, ck::half_t, GNHWC>,
std::tuple<int8_t, int8_t, GNHWC>>;
using KernelTypes3d = ::testing::Types<std::tuple<float, float, GNDHWC>,
std::tuple<ck::bhalf_t, ck::bhalf_t, GNDHWC>,
std::tuple<ck::half_t, ck::half_t, GNDHWC>,
std::tuple<int8_t, int8_t, GNDHWC>>;
template <typename Tuple>
class TestImageToColumn1d : public TestImageToColumn<Tuple>
{
};
template <typename Tuple>
class TestImageToColumn2d : public TestImageToColumn<Tuple>
{
};
template <typename Tuple>
class TestImageToColumn3d : public TestImageToColumn<Tuple>
{
};
TYPED_TEST_SUITE(TestImageToColumn1d, KernelTypes1d);
TYPED_TEST_SUITE(TestImageToColumn2d, KernelTypes2d);
TYPED_TEST_SUITE(TestImageToColumn3d, KernelTypes3d);
TYPED_TEST(TestImageToColumn1d, Test1D)
{
this->conv_params.clear();
this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {2}, {1}, {0}, {0}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
// ScalarPerVector should be 1
this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
// stride != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
// dilation != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
this->template Run<1>();
}
TYPED_TEST(TestImageToColumn2d, Test2D)
{
this->conv_params.clear();
this->conv_params.push_back(
{2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back(
{2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
this->template Run<2>();
}
TYPED_TEST(TestImageToColumn3d, Test3D)
{
this->conv_params.clear();
this->conv_params.push_back(
{3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->template Run<3>();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include <gtest/gtest.h>
using DataType = float;
using InLayout = ck::tensor_layout::convolution::GNWC;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
template <ck::index_t ScalarPerVector, bool IsCPacked>
class TestImageToColumnInterface : public ::testing::Test
{
protected:
static constexpr ck::index_t NDimSpatial = 1;
// clang-format off
using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
< NDimSpatial, InLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>;
// clang-format on
ck::utils::conv::ConvParam conv_param;
bool Run()
{
const auto N = conv_param.N_;
const auto C = conv_param.C_;
const auto FakeC =
conv_param.C_ / 2; // Fake C to simulate the behavior that C is not packed
const ck::index_t NDoHoWo =
N *
ck::accumulate_n<ck::index_t>(
conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const ck::index_t CZYX =
C *
ck::accumulate_n<ck::index_t>(
conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const auto in_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
std::array<ck::index_t, 2> output_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
copy(out_desc.GetStrides(), output_m_k_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
auto img2col = DeviceImgToColInstance{};
auto argument = img2col.MakeArgument(nullptr,
nullptr,
N,
IsCPacked ? C : FakeC,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_g_n_c_wis_strides,
output_m_k_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
return img2col.IsSupportedArgument(argument);
}
};
class TestImageToColumnInterface1ScalarPerVector : public TestImageToColumnInterface<1, true>
{
};
class TestImageToColumnInterface4ScalarPerVector : public TestImageToColumnInterface<4, true>
{
};
class TestImageToColumnInterface4ScalarPerVectorFakeC : public TestImageToColumnInterface<4, false>
{
};
TEST_F(TestImageToColumnInterface1ScalarPerVector, X1ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
// C = 4
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
}
TEST_F(TestImageToColumnInterface4ScalarPerVector, X4ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->Run();
EXPECT_FALSE(is_supported);
// C = 4
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
}
TEST_F(TestImageToColumnInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC)
{
// C = 3
this->conv_param = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->Run();
EXPECT_FALSE(is_supported);
// C = 4
this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}};
is_supported = this->Run();
EXPECT_TRUE(is_supported);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment