Commit 1c62b47b authored by Chao Liu

clean up

parent c9b52bce
#ifndef CK_GRIDWISE_COL2IM_EB_NCHW_HPP
#define CK_GRIDWISE_COL2IM_EB_NCHW_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
namespace ck {
// B = merge(N, Ho, Wo)
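// E = merge(C, Y, X)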
template <index_t GridSize,
index_t BlockSize,
typename Float,
typename ColGlobalDesc,
typename ImgGlobalDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads,
index_t EPerBlock,
index_t BPerBlock,
typename BlockCopySubLengths_E_B,
typename BlockCopyClusterLengths_E_B,
typename BlockCopyThreadClusterArrangeOrder,
typename BlockCopySrcAccessOrder,
typename BlockCopyDstAccessOrder,
index_t BlockCopyDataPerAccess_B>
struct GridwiseCol2Im_eb_nchw
{
__device__ void Run(const Float* const __restrict__ p_col_global,
Float* const __restrict__ p_img_global) const
{
constexpr auto col_e_b_global_desc = ColGlobalDesc{};
constexpr auto img_n_c_hi_wi_global_desc = ImgGlobalDesc{};
constexpr index_t N = img_n_c_hi_wi_global_desc.GetLengths()[0];
constexpr index_t C = img_n_c_hi_wi_global_desc.GetLengths()[1];
constexpr index_t Hi = img_n_c_hi_wi_global_desc.GetLengths()[2];
constexpr index_t Wi = img_n_c_hi_wi_global_desc.GetLengths()[3];
constexpr index_t Ho = OutputSizes{}[0];
constexpr index_t Wo = OutputSizes{}[1];
constexpr index_t Y = FilterSizes{}[0];
constexpr index_t X = FilterSizes{}[1];
constexpr index_t ConvStrideH = ConvStrides{}[0];
constexpr index_t ConvStrideW = ConvStrides{}[1];
constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];
constexpr index_t E = C * Y * X;
constexpr index_t B = N * Ho * Wo;
// sanity-check for vectorized memory load
static_assert((Wo == 1 || (ConvStrideW == 1 || BlockCopyDataPerAccess_B == 1)) &&
(X == 1 || ConvDilationW % BlockCopyDataPerAccess_B == 0),
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated");
// divide block work by [E, B]
static_assert(E % EPerBlock == 0 && B % BPerBlock == 0,
"wrong! cannot divide work evenly among block");
constexpr index_t EBlockWork = E / EPerBlock;
constexpr index_t BBlockWork = B / BPerBlock;
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<EBlockWork, BBlockWork>{});
const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
const index_t e_block_data_on_global = block_work_id[Number<0>{}] * EPerBlock;
const index_t b_block_data_on_global = block_work_id[Number<1>{}] * BPerBlock;
// construct img_eb_global_desc
constexpr auto img_n_c_hip_wip_global_desc = transform_tensor_descriptor(
img_n_c_hi_wi_global_desc,
make_tuple(
PassThrough<N>{}, PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));
constexpr index_t Hip = img_n_c_hip_wip_global_desc.GetLengths()[2];
constexpr index_t Wip = img_n_c_hip_wip_global_desc.GetLengths()[3];
constexpr auto img_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
img_n_c_hip_wip_global_desc,
make_tuple(PassThrough<N>{},
PassThrough<C>{},
Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
constexpr auto img_e_b_global_desc = transform_tensor_descriptor(
img_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// blockwise atomic accumulation
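// multiple (e, b) column entries map to the same (n, c, hi, wi) image element whenever filter
// windows overlap, so the destination write uses InMemoryDataOperation::AtomicAdd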
auto blockwise_copy = BlockwiseGenericTensorSliceCopy_v4<BlockSize,
decltype(col_e_b_global_desc),
decltype(img_e_b_global_desc),
Sequence<EPerBlock, BPerBlock>,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_B,
BlockCopyDataPerAccess_B,
AddressSpace::Vgpr,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd>(
make_multi_index(e_block_data_on_global, b_block_data_on_global),
make_multi_index(e_block_data_on_global, b_block_data_on_global));
// blockwise copy
blockwise_copy.Run(p_col_global, p_img_global);
}
};
} // namespace ck
#endif
#ifndef CK_GRIDWISE_DYNAMIC_COL2IM_GEMMKGEMMN_NCHW_HPP
#define CK_GRIDWISE_DYNAMIC_COL2IM_GEMMKGEMMN_NCHW_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
namespace ck {
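// Build a GemmK x GemmN view of the NCHW image tensor for col2im:
// pad H/W, embed the sliding filter window (Y, Ho) / (X, Wo) with the conv dilations/strides,
// then merge (C, Y, X) -> GemmK and (N, Ho, Wo) -> GemmN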
template <typename... In>
__host__ __device__ constexpr auto
map_img_into_col(const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const MultiIndex<2> out_sizes,
const MultiIndex<2> filter_sizes,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_sizes[I0];
const index_t Wo = out_sizes[I1];
const index_t Y = filter_sizes[I0];
const index_t X = filter_sizes[I1];
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
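// padded spatial lengths: Hip = Hi + InLeftPadH + InRightPadH, Wip = Wi + InLeftPadW + InRightPadW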
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemmk_gemmn_global_desc;
}
template <index_t BlockSize,
index_t GemmKPerBlock,
index_t GemmNPerBlock,
typename BlockCopySubLengths_GemmK_GemmN,
typename BlockCopyClusterLengths_GemmK_GemmN,
typename BlockCopyThreadClusterArrangeOrder,
typename BlockCopySrcAccessOrder,
typename BlockCopyDstAccessOrder,
index_t BlockCopyDataPerAccess_GemmN>
struct GridwiseDynamicCol2Im_gemmkgemmn_nchw
{
// this version has a scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs a new tensor coordinate
template <typename... Col, typename... Img>
__device__ void Run_r1(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);
// divide block work by GemmN
const index_t GemmNBlockWork = GemmN / GemmNPerBlock;
const index_t block_work_id = get_block_1d_id();
const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
auto blockwise_copy =
#if 1
BlockwiseDynamicTensorSliceTransfer_v1r1<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>
#elif 1
BlockwiseDynamicTensorSliceTransfer_v2r1<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>
#endif
(col_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
img_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
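// sweep the GemmK dimension: each iteration copies one GemmKPerBlock x GemmNPerBlock tile with
// atomic accumulation, then advances both slice windows by GemmKPerBlock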
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
blockwise_copy.Run(p_col_global, p_img_global);
blockwise_copy.MoveSrcSliceWindow(make_multi_index(GemmKPerBlock, 0));
blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
}
}
// this version does not have the scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct a new tensor coordinate
template <typename... Col, typename... Img>
__device__ void Run_r2(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);
// divide block work by GemmN
const index_t GemmNBlockWork = GemmN / GemmNPerBlock;
const index_t block_work_id = get_block_1d_id();
const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
auto blockwise_copy =
BlockwiseDynamicTensorSliceTransfer_v2r2<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>(
col_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
img_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
blockwise_copy.Run(col_gemmk_gemmn_global_desc,
p_col_global,
img_gemmk_gemmn_global_desc,
p_img_global);
blockwise_copy.MoveSrcSliceWindow(col_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
blockwise_copy.MoveDstSliceWindow(img_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
}
}
template <typename... Col, typename... Img>
__device__ void Run(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
{
Run_r2(
p_col_global, p_img_global, col_gemmk_gemmn_global_desc, img_gemmk_gemmn_global_desc);
}
};
} // namespace ck
#endif
#ifndef CK_DYNAMIC_TENSOR_COORDINATE_V1_HPP
#define CK_DYNAMIC_TENSOR_COORDINATE_V1_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v1.hpp"
namespace ck {
// A "tensor cooridnate" is an opaque object that represents a "point of location" inside a tensor
// At the bare minimun, user should be able to query the following information from a tensor
// coordinate:
// 1. Tensor descriptor
// 2. Location, represented in the form of multi-index
// 3. Location, represented in the form of the offset to the origin of the tensor
// 4. If the location is inside invalid area or not, e.g. the padding area of an implicitly padded
// tensor is considered invalid, because the padding area doesn't have any physical memory
// allocation
// A tensor cooridnate also provides following functionality:
// 1. Given step size in each dimension, update itself, or return a new tensor cooridnate, so user
// can freely move the "point of location" inside the tensor
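//
// A minimal usage sketch (illustrative only; it assumes the helpers make_multi_index and
// make_dynamic_naive_tensor_descriptor_v1 declared elsewhere in this code base):
//
//   const auto desc = make_dynamic_naive_tensor_descriptor_v1(make_multi_index(4, 8),   // lengths
//                                                             make_multi_index(8, 1));  // strides
//   auto coord = make_dynamic_tensor_coordinate_v1(desc, make_multi_index(1, 2));
//   coord += make_multi_index(0, 3);            // step the "point of location" by (0, 3)
//   const index_t off = coord.GetOffset();      // offset relative to the tensor origin
//   const bool ok     = coord.IsOffsetValid();  // false if the point falls in an invalid area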
// wrapper class for DynamicNativeTensorCoordinate_v1 and DynamicTransformedTensorCoordinate_v1
template <typename TensorDesc>
struct DynamicTensorCoordinate_v1;
// tensor coordinate for native tensor
template <typename TensorDesc>
struct DynamicNativeTensorCoordinate_v1
{
using type = DynamicNativeTensorCoordinate_v1;
using tensor_desc_type = TensorDesc;
static constexpr index_t NDim = tensor_desc_type::GetNumOfDimension();
using Index = MultiIndex<NDim>;
__host__
__device__ constexpr DynamicNativeTensorCoordinate_v1(const tensor_desc_type& tensor_desc,
const Index& idx)
: tensor_desc_{tensor_desc}, idx_{idx}, offset_{tensor_desc.CalculateOffset(idx)}
{
}
__host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_; }
__host__ __device__ constexpr const auto& GetUpperIndex() const { return idx_; }
__host__ __device__ constexpr const auto& GetIndex() const { return idx_; }
__host__ __device__ constexpr const index_t& GetOffset() const { return offset_; }
__host__ __device__ constexpr type operator+=(const Index& idx_diff)
{
// idx_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_ += idx_diff;
offset_ += tensor_desc_.CalculateOffsetDiff(idx_diff);
return *this;
}
__host__ __device__ constexpr type operator-=(const Index& idx_diff)
{
// idx_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_ -= idx_diff;
offset_ -= tensor_desc_.CalculateOffsetDiff(idx_diff);
return *this;
}
__host__ __device__ constexpr type operator+(const Index& idx_diff) const
{
type coord = *this;
coord += idx_diff;
return coord;
}
__host__ __device__ constexpr type operator-(const Index& idx_diff) const
{
type coord = *this;
coord -= idx_diff;
return coord;
}
__host__ __device__ constexpr index_t CalculateOffsetDiff(const Index& idx_diff) const
{
return tensor_desc_.CalculateOffsetDiff(idx_diff);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsUpperIndexValid() const
{
return tensor_desc_.IsUpperIndexValid(idx_);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsOffsetValid() const
{
// For native tensor, offset is valid if upper-index is valid
return IsUpperIndexValid();
}
// evaluated at compile-time
__host__ __device__ static constexpr bool IsOffsetValidAssumingUpperIndexIsValid()
{
return true;
}
private:
const tensor_desc_type tensor_desc_;
// idx_ may be saved and updated, however, the value of some (or all) of its entries may
// never be used. The compiler should be able to remove those entries, as well as their
// calculation, as dead code.
// TODO: make sure the compiler indeed removes this dead code
Index idx_;
index_t offset_;
};
// tensor coordinate for transformed tensor
template <typename TensorDesc>
struct DynamicTransformedTensorCoordinate_v1
{
static constexpr index_t NDimUp = TensorDesc::GetNumOfDimension();
using UpperDesc = TensorDesc;
using UpperCoord = DynamicTransformedTensorCoordinate_v1;
using UpperIndex = MultiIndex<NDimUp>;
using LowerDesc = typename UpperDesc::LowerDesc;
using LowerCoord = typename DynamicTensorCoordinate_v1<LowerDesc>::type;
__host__
__device__ constexpr DynamicTransformedTensorCoordinate_v1(const UpperDesc& tensor_desc_up,
const UpperIndex& idx_up)
: tensor_desc_up_{tensor_desc_up},
idx_up_{idx_up},
coord_low_{tensor_desc_up.GetLowerTensorDescriptor(),
tensor_desc_up.CalculateLowerIndex(idx_up)}
{
}
__host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_up_; }
__host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const
{
return coord_low_;
}
__host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return idx_up_; }
__host__ __device__ constexpr const UpperIndex& GetIndex() const { return idx_up_; }
__host__ __device__ constexpr const index_t& GetOffset() const
{
return GetLowerCoordinate().GetOffset();
}
__host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
{
// For transformation of multi-index difference, not all transformation functions need to
// know the old lower-index or the old upper-index. We pass both of them to the
// transformation function. The transformation function itself decides to use them or not.
coord_low_ += tensor_desc_up_.CalculateLowerIndexDiff(
idx_up_diff, GetLowerCoordinate().GetIndex(), GetIndex());
// idx_up_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_up_ += idx_up_diff;
return *this;
}
__host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
{
coord_low_ -= tensor_desc_up_.CalculateLowerIndexDiff(
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
// idx_up_ is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
idx_up_ -= idx_up_diff;
return *this;
}
__host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
{
UpperCoord coord_up = *this;
coord_up += idx_up_diff;
return coord_up;
}
__host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
{
UpperCoord coord_up = *this;
coord_up -= idx_up_diff;
return coord_up;
}
// Calculate offset diff without updating the tensor coordinate
// If idx_up_diff is known at compile time, and has non-zero entries only on linear dimensions,
// then all calculation can be done at compile time.
// TODO: this function does not compile to the expected ISA
__host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
{
// For transformation of multi-index difference, not all transformation functions need to
// know the old lower-index or the old upper-index. We pass both of them to the
// transformation function. The transformation function itself decides to use them or not.
const auto idx_low_diff =
tensor_desc_up_.CalculateLowerIndexDiff(idx_up_diff, coord_low_.GetIndex(), idx_up_);
return coord_low_.CalculateOffsetDiff(idx_low_diff);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsUpperIndexValid() const
{
return tensor_desc_up_.IsUpperIndexValid(idx_up_);
}
// evaluated at run-time
__host__ __device__ constexpr bool IsOffsetValid() const
{
return IsUpperIndexValid() && coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
}
// most evaluation is done at compile-time
__host__ __device__ constexpr bool IsOffsetValidAssumingUpperIndexIsValid() const
{
return tensor_desc_up_.IsValidUpperIndexMappedToValidLowerIndex(idx_up_) &&
coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
}
private:
const UpperDesc tensor_desc_up_;
// idx_up_ may be calculated and updated, however, the value of some (or all) of its entries
// may never be used. The compiler should be able to remove those entries, as well as their
// calculation, as dead code.
// TODO: make sure the compiler indeed removes this dead code
UpperIndex idx_up_;
LowerCoord coord_low_;
};
template <index_t NDim>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_v1(const DynamicNativeTensorDescriptor_v1<NDim>& tensor_desc,
const MultiIndex<NDim>& idx)
{
return DynamicNativeTensorCoordinate_v1<DynamicNativeTensorDescriptor_v1<NDim>>{tensor_desc,
idx};
}
template <index_t NDim, typename... Ts>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_v1(const DynamicTransformedTensorDescriptor_v1<Ts...>& tensor_desc,
const MultiIndex<NDim>& idx)
{
static_assert(DynamicTransformedTensorDescriptor_v1<Ts...>::GetNumOfDimension() == NDim,
"wrong! inconsistent # of dimensions");
return DynamicTransformedTensorCoordinate_v1<DynamicTransformedTensorDescriptor_v1<Ts...>>{
tensor_desc, idx};
}
template <typename TensorDesc>
struct DynamicTensorCoordinate_v1
{
static constexpr index_t NDim = TensorDesc::GetNumOfDimension();
using type =
decltype(make_dynamic_tensor_coordinate_v1<NDim>(TensorDesc{}, MultiIndex<NDim>{}));
};
} // namespace ck
#endif
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V1_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V1_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v1.hpp"
namespace ck {
template <typename Lengths, typename Strides>
__host__ __device__ constexpr auto make_dynamic_naive_tensor_descriptor_v1(const Lengths& lengths,
const Strides& strides)
{
static_assert(Lengths::Size() == Strides::Size(), "wrong! Size not the same");
return DynamicNativeTensorDescriptor_v1<Lengths::Size()>(lengths, strides);
}
template <typename LowTensorDescriptor,
typename Transforms,
typename LowDimensionIds,
typename UpDimensionIds>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor_v1(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms,
LowDimensionIds,
UpDimensionIds)
{
return DynamicTransformedTensorDescriptor_v1<LowTensorDescriptor,
Transforms,
LowDimensionIds,
UpDimensionIds>{low_tensor_desc, transforms};
}
} // namespace ck
#endif
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_V1_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_V1_HPP
#include "common_header.hpp"
#include "dynamic_multi_index_transform.hpp"
namespace ck {
template <index_t NDim>
struct DynamicNativeTensorDescriptor_v1
{
using Index = MultiIndex<NDim>;
const Index lengths_;
const Index strides_;
__host__ __device__ constexpr DynamicNativeTensorDescriptor_v1(const Index& lengths,
const Index& strides)
: lengths_{lengths}, strides_{strides}
{
}
__host__ __device__ constexpr DynamicNativeTensorDescriptor_v1()
: lengths_{make_zero_multi_index<NDim>()}, strides_{make_zero_multi_index<NDim>()}
{
}
__host__ __device__ static constexpr index_t GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
__host__ __device__ constexpr auto GetStrides() const { return strides_; }
template <index_t IDim>
__host__ __device__ constexpr index_t GetLength(Number<IDim>) const
{
return lengths_[Number<IDim>{}];
}
template <index_t IDim>
__host__ __device__ constexpr index_t GetStride(Number<IDim>) const
{
return strides_[Number<IDim>{}];
}
__host__ __device__ constexpr index_t GetElementSize() const
{
return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
{
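// element space = 1 + sum over dimensions of (length - 1) * stride,
// i.e. the offset of the last element plus one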
index_t space = 1;
static_for<0, NDim, 1>{}([&](auto i) { space += (GetLength(i) - 1) * GetStride(i); });
return space;
}
template <typename Idx>
__host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const
{
index_t offset = 0;
static_for<0, NDim, 1>{}([&](auto i) { offset += idx[i] * GetStride(i); });
return offset;
}
template <typename IdxDiff>
__host__ __device__ constexpr index_t CalculateOffsetDiff(const IdxDiff& idx_diff) const
{
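// for a native (strided) tensor the offset is a linear function of the index,
// so the offset difference is just the offset of the index difference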
return CalculateOffset(idx_diff);
}
template <typename Idx>
__host__ __device__ constexpr bool IsUpperIndexValid(const Idx& idx) const
{
bool flag = true;
static_for<0, NDim, 1>{}(
[&](auto i) { flag = flag && idx[i] >= 0 && idx[i] < GetLength(i); });
return flag;
}
};
template <typename LowTensorDescriptor, // DynamicNativeTensorDescriptor_v1 or
// DynamicTransformedTensorDescriptor_v1
typename Transforms, // Tuple<MultIndexTransforms...>
typename LowDimensionIds, // Tuple<Sequence<...>>
typename UpDimensionIds> // Tuple<Sequence<...>>
struct DynamicTransformedTensorDescriptor_v1
{
using LowerDesc = LowTensorDescriptor;
using UpperDesc = DynamicTransformedTensorDescriptor_v1;
static constexpr index_t NTransform = Transforms::Size();
const LowerDesc low_tensor_desc_;
const Transforms transforms_;
__host__ __device__ static constexpr index_t GetNumOfLowerDimension()
{
return LowerDesc::GetNumOfDimension();
}
__host__ __device__ static constexpr index_t GetNumOfUpperDimension()
{
index_t ndim_up = 0;
static_for<0, NTransform, 1>{}([&](auto i) constexpr {
constexpr auto tmp = UpDimensionIds{}.At(i);
ndim_up += decltype(tmp)::Size();
});
return ndim_up;
}
static constexpr index_t NDimUp = GetNumOfUpperDimension();
static constexpr index_t NDimLow = GetNumOfLowerDimension();
using UpperIndex = MultiIndex<NDimUp>;
using LowerIndex = MultiIndex<NDimLow>;
struct lambda_merge_sequences
{
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return merge_sequences(xs...);
}
};
struct lambda_merge_arrays
{
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return container_cat(xs...);
}
};
__host__
__device__ constexpr DynamicTransformedTensorDescriptor_v1(const LowerDesc& low_tensor_desc,
const Transforms& transforms)
: low_tensor_desc_{low_tensor_desc}, transforms_{transforms}
{
static_assert(NTransform == Transforms::Size() && NTransform == LowDimensionIds::Size() &&
NTransform == UpDimensionIds::Size(),
"wrong! # of transformations not the same");
// sanity check:
// LowDimensionIds should include all low-dimensions,
// UpDimensionIds should include all up-dimensions
using unsorted_up_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, UpDimensionIds{}));
using sorted_up_dimension_ids =
typename sequence_sort<unsorted_up_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_up_dimension_ids::Size() == NDimUp &&
is_valid_sequence_map<sorted_up_dimension_ids>{},
"wrong! UpDimensionIds is not configured correctly");
using unsorted_low_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, LowDimensionIds{}));
using sorted_low_dimension_ids =
typename sequence_sort<unsorted_low_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_low_dimension_ids::Size() == NDimLow &&
is_valid_sequence_map<sorted_low_dimension_ids>{},
"wrong! LowDimensionIds is not configured correctly");
// TODO: sanity check: while an up-dimension could be associated with multiple
// transformations, a low-dimension should be associated with only one transformation
// TODO: sanity check: GetLowerLengths of each transform should be consistent with the
// lengths of the lower tensor descriptor
}
__host__ __device__ constexpr DynamicTransformedTensorDescriptor_v1()
: low_tensor_desc_{}, transforms_{}
{
}
__host__ __device__ static constexpr index_t GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
// sort upper-dimension-ids
constexpr auto unsorted_up_dimension_ids =
unpack(lambda_merge_sequences{}, UpDimensionIds{});
using sort_up_dimension_ids = sequence_unique_sort<decltype(unsorted_up_dimension_ids),
math::less<index_t>,
math::equal<index_t>>;
constexpr auto sorted2unsorted_map = typename sort_up_dimension_ids::sorted2unsorted_map{};
// sort upper-lengths
const auto tuple_of_up_lengths =
transform_tuples([](const auto& tran) constexpr { return tran.GetUpperLengths(); },
transforms_);
const auto unsorted_up_lengths = unpack(lambda_merge_arrays{}, tuple_of_up_lengths);
const auto sorted_up_lengths =
container_reorder_given_new2old(unsorted_up_lengths, sorted2unsorted_map);
return sorted_up_lengths;
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
template <index_t IDim>
__host__ __device__ constexpr index_t GetLength(Number<IDim>) const
{
return GetLengths()[Number<IDim>{}];
}
__host__ __device__ constexpr index_t GetElementSize() const
{
return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
{
return low_tensor_desc_.GetElementSpace();
}
__host__ __device__ constexpr auto GetLowerTensorDescriptor() const { return low_tensor_desc_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
static_for<0, NTransform, 1>{}([&](auto itran) constexpr {
const auto tran = transforms_.At(itran);
const auto idx_up_part = pick_container_element(idx_up, UpDimensionIds{}.At(itran));
auto idx_low_part = pick_container_element(idx_low, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndex(idx_low_part, idx_up_part);
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
const UpIdxDiff& idx_up_diff,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old) const
{
static_for<0, NTransform, 1>{}([&](auto itran) {
const auto tran = transforms_.At(itran);
const auto idx_up_diff_part =
pick_container_element(idx_up_diff, UpDimensionIds{}.At(itran));
const auto idx_up_old_part =
pick_container_element(idx_up_old, UpDimensionIds{}.At(itran));
const auto idx_low_old_part =
pick_container_element(idx_low_old, LowDimensionIds{}.At(itran));
auto idx_low_diff_part =
pick_container_element(idx_low_diff, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndexDiff(
idx_low_diff_part, idx_up_diff_part, idx_low_old_part, idx_up_old_part);
});
}
template <typename UpIdx>
__host__ __device__ constexpr auto CalculateLowerIndex(const UpIdx& idx_up) const
{
LowerIndex idx_low;
CalculateLowerIndex(idx_low, idx_up);
return idx_low;
}
template <typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr auto CalculateLowerIndexDiff(const UpIdxDiff& idx_up_diff,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old) const
{
LowerIndex idx_low_diff;
CalculateLowerIndexDiff(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);
return idx_low_diff;
}
__host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
{
return low_tensor_desc_.CalculateOffset(CalculateLowerIndex(idx_up));
}
__host__ __device__ constexpr bool IsUpperIndexValid(const UpperIndex& idx_up) const
{
bool flag = true;
static_for<0, NDimUp, 1>{}(
[&](auto i) { flag = flag && idx_up[i] >= 0 && idx_up[i] < GetLength(i); });
return flag;
}
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
bool flag = true;
static_for<0, NTransform, 1>{}([&](auto itran) {
const auto tran = Transforms{}.At(itran);
// only check a transformation if it does not always have a valid mapping
constexpr bool is_valid_up_always_mapped_to_valid_low =
decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex();
if constexpr(!is_valid_up_always_mapped_to_valid_low)
{
const auto up_dims_part = UpDimensionIds{}.At(itran);
const auto idx_up_part = pick_container_element(idx_up, up_dims_part);
flag = flag && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
}
});
return flag;
}
};
} // namespace ck
#endif
@@ -17,17 +17,13 @@ install(TARGETS host LIBRARY DESTINATION lib)
if(DEVICE_BACKEND STREQUAL "AMD")
set(CONV_SOURCE src/conv_driver.cpp)
set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
set(COL2IM_SOURCE src/col2im_driver.cpp)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(CONV_SOURCE src/conv_driver.cu)
set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
set(COL2IM_SOURCE src/col2im_driver.cu)
endif()
add_executable(conv_driver ${CONV_SOURCE})
add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
add_executable(col2im_driver ${COL2IM_SOURCE})
target_link_libraries(conv_driver PRIVATE host)
target_link_libraries(conv_bwd_data_driver PRIVATE host)
target_link_libraries(col2im_driver PRIVATE host)
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_col2im_eb_nchw.hpp"
template <typename T,
typename ColDesc,
typename ImgDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_col2im_eb_nchw(ColDesc,
const Tensor<T>& col_eb,
ImgDesc,
Tensor<T>& img_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr auto col_eb_desc = ColDesc{};
constexpr auto img_nchw_desc = ImgDesc{};
constexpr index_t N = img_nchw_desc.GetLengths()[0];
constexpr index_t C = img_nchw_desc.GetLengths()[1];
constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
constexpr index_t Wi = img_nchw_desc.GetLengths()[3];
constexpr index_t E = col_eb_desc.GetLengths()[0];
constexpr index_t B = col_eb_desc.GetLengths()[1];
std::size_t data_sz = sizeof(T);
DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());
col_eb_device_buf.ToDevice(col_eb.mData.data());
img_nchw_device_buf.ToDevice(img_nchw.mData.data());
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t EPerBlock = 128;
constexpr index_t BPerBlock = 128;
using BlockCopySubLengths_E_B = Sequence<8, 8>;
using BlockCopyClusterLengths_E_B = Sequence<16, 16>;
using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using BlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using BlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
constexpr index_t BlockCopyDataPerAccess_B = 1;
#endif
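// one workgroup per EPerBlock x BPerBlock tile of the E x B column matrix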
constexpr index_t GridSize =
((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_col2im = GridwiseCol2Im_eb_nchw<GridSize,
BlockSize,
T,
ColDesc,
ImgDesc,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
EPerBlock,
BPerBlock,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
BlockCopyDataPerAccess_B>{};
for(index_t i = 0; i < 1; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
const T* const __restrict__,
T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
const_cast<const T* const __restrict__>(
static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
const_cast<T* const __restrict__>(
static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
std::cout << "Average time : " << ave_time << " ms" << std::endl;
}
img_nchw_device_buf.FromDevice(img_nchw.mData.data());
}
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_dynamic_col2im_gemmkgemmn_nchw.hpp"
template <typename T,
typename ColDesc,
typename ImgDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_col2im_gemmkgemmn_nchw(ColDesc,
const Tensor<T>& col_gemmk_gemmn,
ImgDesc,
Tensor<T>& img_n_c_hi_wi,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
std::size_t data_sz = sizeof(T);
DeviceMem col_gemmk_gemmn_device_buf(data_sz * col_gemmk_gemmn.mDesc.GetElementSpace());
DeviceMem img_n_c_hi_wi_device_buf(data_sz * img_n_c_hi_wi.mDesc.GetElementSpace());
col_gemmk_gemmn_device_buf.ToDevice(col_gemmk_gemmn.mData.data());
img_n_c_hi_wi_device_buf.ToDevice(img_n_c_hi_wi.mData.data());
const auto col_gemmk_gemmn_desc = make_dynamic_naive_tensor_descriptor<2>(
to_multi_index(ColDesc::GetLengths()), to_multi_index(ColDesc::GetStrides()));
const auto img_n_c_hi_wi_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(ImgDesc::GetLengths()), to_multi_index(ImgDesc::GetStrides()));
const auto filter_sizes = to_multi_index(FilterSizes{});
const auto out_sizes = to_multi_index(OutputSizes{});
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
const auto img_gemmk_gemmn_desc = map_img_into_col(img_n_c_hi_wi_desc,
out_sizes,
filter_sizes,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const index_t GemmN = col_gemmk_gemmn_desc.GetLength(I1);
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmNPerBlock = 128;
using BlockCopySubLengths_GemmK_GemmN = Sequence<1, 8>;
using BlockCopyClusterLengths_GemmK_GemmN = Sequence<8, 16>;
using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [GemmK, GemmN]
using BlockCopySrcAccessOrder = Sequence<0, 1>; // [GemmK, GemmN]
using BlockCopyDstAccessOrder = Sequence<0, 1>; // [GemmK, GemmN]
constexpr index_t BlockCopyDataPerAccess_GemmN = 1;
#endif
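// one workgroup per GemmNPerBlock-wide strip of GemmN; each block loops over the full GemmK
// inside the kernel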
const index_t GridSize = GemmN / GemmNPerBlock;
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_col2im =
GridwiseDynamicCol2Im_gemmkgemmn_nchw<BlockSize,
GemmKPerBlock,
GemmNPerBlock,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
BlockCopyDataPerAccess_GemmN>{};
for(index_t i = 0; i < 1; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
const T* const __restrict__,
T* const __restrict__,
decltype(col_gemmk_gemmn_desc),
decltype(img_gemmk_gemmn_desc)>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
const_cast<const T* const __restrict__>(
static_cast<T*>(col_gemmk_gemmn_device_buf.GetDeviceBuffer())),
const_cast<T* const __restrict__>(
static_cast<T*>(img_n_c_hi_wi_device_buf.GetDeviceBuffer())),
col_gemmk_gemmn_desc,
img_gemmk_gemmn_desc);
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
std::cout << "Average time : " << ave_time << " ms" << std::endl;
}
img_n_c_hi_wi_device_buf.FromDevice(img_n_c_hi_wi.mData.data());
}
#pragma once
#include "host_tensor.hpp"
template <typename T,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void host_col2im(const Tensor<T>& in_eb,
Tensor<T>& in_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads)
{
using namespace ck;
int N = in_nchw.mDesc.GetLengths()[0];
int C = in_nchw.mDesc.GetLengths()[1];
int Hi = in_nchw.mDesc.GetLengths()[2];
int Wi = in_nchw.mDesc.GetLengths()[3];
int Y = FilterSizes{}[0];
int X = FilterSizes{}[1];
int Ho = OutputSizes{}[0];
int Wo = OutputSizes{}[1];
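// col2im reference: each image element (n, c, hi, wi) accumulates every column entry (e, b)
// whose filter window covers it; invert hi = ho * ConvStrideH - LeftPadH + y * ConvDilationH
// to recover ho (and likewise for wo)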
auto f = [&](auto n, auto c, auto hi, auto wi) {
double v = 0;
for(int y = 0; y < Y; ++y)
{
int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
if(h_tmp % ConvStrides{}[0] == 0)
{
int ho = h_tmp / ConvStrides{}[0];
if(ho >= 0 && ho < Ho)
{
for(int x = 0; x < X; ++x)
{
int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
if(w_tmp % ConvStrides{}[1] == 0)
{
int wo = w_tmp / ConvStrides{}[1];
if(wo >= 0 && wo < Wo)
{
int e = c * (Y * X) + y * X + x;
int b = n * (Ho * Wo) + ho * Wo + wo;
v += in_eb(e, b);
}
}
}
}
}
}
in_nchw(n, c, hi, wi) = v;
};
auto f_par = make_ParallelTensorFunctor(f,
in_nchw.mDesc.GetLengths()[0],
in_nchw.mDesc.GetLengths()[1],
in_nchw.mDesc.GetLengths()[2],
in_nchw.mDesc.GetLengths()[3]);
f_par(std::thread::hardware_concurrency());
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "host_col2im.hpp"
#include "device_col2im_eb_nchw.hpp"
#include "device_dynamic_col2im_gemmkgemmn_nchw.hpp"
int main(int argc, char* argv[])
{
using namespace ck;
#if 1
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 3x3, 71x71
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t HI = 71;
constexpr index_t WI = 71;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 1
// 1x1, 8x8
constexpr index_t N = 128;
constexpr index_t C = 1536;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 73x73
constexpr index_t N = 128;
constexpr index_t C = 160;
constexpr index_t HI = 73;
constexpr index_t WI = 73;
constexpr index_t K = 64;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 35x35
constexpr index_t N = 128;
constexpr index_t C = 96;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 96;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 3x3, 71x71
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t HI = 71;
constexpr index_t WI = 71;
constexpr index_t K = 192;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 7x1, 17x17
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 7;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>;
#elif 1
// 1x7, 17x17
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 7;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 3>;
using RightPads = Sequence<0, 3>;
#elif 0
// 3x3, 299x299 stride=2
constexpr index_t N = 128;
constexpr index_t C = 3;
constexpr index_t HI = 299;
constexpr index_t WI = 299;
constexpr index_t K = 32;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 147x147
constexpr index_t N = 128;
constexpr index_t C = 32;
constexpr index_t HI = 147;
constexpr index_t WI = 147;
constexpr index_t K = 64;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 3x3, 149x149
constexpr index_t N = 128;
constexpr index_t C = 32;
constexpr index_t HI = 149;
constexpr index_t WI = 149;
constexpr index_t K = 32;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 17x17, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 192;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 35x35
constexpr index_t N = 128;
constexpr index_t C = 384;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 96;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 384;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x3, 8x8
constexpr index_t N = 128;
constexpr index_t C = 384;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 448;
constexpr index_t Y = 1;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 1>;
using RightPads = Sequence<0, 1>;
#elif 0
// 3x1, 8x8
constexpr index_t N = 128;
constexpr index_t C = 448;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 512;
constexpr index_t Y = 3;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 0>;
using RightPads = Sequence<1, 0>;
#elif 0
// 3x3, 147x147
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 147;
constexpr index_t WI = 147;
constexpr index_t K = 96;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 7x1, 73x73
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 73;
constexpr index_t WI = 73;
constexpr index_t K = 64;
constexpr index_t Y = 7;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>;
#elif 0
// 3x3, 73x73
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 73;
constexpr index_t WI = 73;
constexpr index_t K = 96;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 14x14, stride 2
constexpr index_t N = 128;
constexpr index_t C = 1024;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 2048;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 14x14
constexpr index_t N = 128;
constexpr index_t C = 1024;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 14x14, stride 2
constexpr index_t N = 128;
constexpr index_t C = 1024;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 512;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
// 3x3, 28x28
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 3x3, 14x14
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 256;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 1x1, 56x56, stride 2
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 56;
constexpr index_t WI = 56;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 7x7, 230x230 stride=2
constexpr index_t N = 128;
constexpr index_t C = 3;
constexpr index_t HI = 230;
constexpr index_t WI = 230;
constexpr index_t K = 64;
constexpr index_t Y = 7;
constexpr index_t X = 7;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 28x28, stride = 2
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 1024;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 28x28, stride 2
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 7x7
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 2048;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 7x7
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 512;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 1x1, 56x56
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 56;
constexpr index_t WI = 56;
constexpr index_t K = 64;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 56x56
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 56;
constexpr index_t WI = 56;
constexpr index_t K = 64;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#endif
constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
constexpr index_t WO = out_nkhw_desc.GetLengths()[3];
constexpr auto col_eb_desc =
make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});
using FilterSizes = Sequence<Y, X>;
using OutputSizes = Sequence<HO, WO>;
ostream_tensor_descriptor(col_eb_desc, std::cout << "col_eb_desc: ");
ostream_tensor_descriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
print_array("FilterSizes", FilterSizes{});
print_array("OutputSizes", OutputSizes{});
print_array("LeftPads", LeftPads{});
print_array("LeftPads", LeftPads{});
print_array("RightPads", RightPads{});
print_array("ConvStrides", ConvStrides{});
print_array("ConvDilations", ConvDilations{});
Tensor<float> col_eb(make_HostTensorDescriptor(col_eb_desc));
Tensor<float> img_nchw_host(make_HostTensorDescriptor(img_nchw_desc));
Tensor<float> img_nchw_device(make_HostTensorDescriptor(img_nchw_desc));
std::size_t num_thread = std::thread::hardware_concurrency();
if(argc != 3)
{
printf("arg1: do_verification, arg2: nrepeat\n");
exit(1);
}
bool do_verification = atoi(argv[1]);
std::size_t nrepeat = atoi(argv[2]);
if(do_verification)
{
#if 0
col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#else
col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#endif
}
#if 0
device_col2im_eb_nchw(col_eb_desc,
col_eb,
img_nchw_desc,
img_nchw_device,
FilterSizes{},
OutputSizes{},
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 1
device_dynamic_col2im_gemmkgemmn_nchw(col_eb_desc,
col_eb,
img_nchw_desc,
img_nchw_device,
FilterSizes{},
OutputSizes{},
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#endif
if(do_verification)
{
host_col2im(col_eb,
img_nchw_host,
FilterSizes{},
OutputSizes{},
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{});
check_error(img_nchw_host, img_nchw_device);
#if 0
LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
#endif
}
}