Commit 711701f3 authored by Chao Liu
Browse files

adding dynamic copy

parent 176d9fc0
#ifndef CK_DYNAMIC_GRIDWISE_COL2IM_EB_NCHW_HPP
#define CK_DYNAMIC_GRIDWISE_COL2IM_EB_NCHW_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
namespace ck {
// B = merge(N, Ho, Wo)
// Grid-wise col2im (compile-time descriptor version).
//
// Scatters the column matrix col[E, B] back into an NCHW image, where
// E = C * Y * X (elements of one filter window) and B = N * Ho * Wo (output
// positions). Overlapping filter windows map onto the same image pixel, so
// the blockwise copy at the end accumulates with
// InMemoryDataOperation::AtomicAdd. All geometry is known at compile time.
template <index_t GridSize,
index_t BlockSize,
typename Float,
typename ColGlobalDesc,
typename ImgGlobalDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads,
index_t EPerBlock,
index_t BPerBlock,
typename BlockCopySubLengths_E_B,
typename BlockCopyClusterLengths_E_B,
typename BlockCopyThreadClusterArrangeOrder,
typename BlockCopySrcAccessOrder,
typename BlockCopyDstAccessOrder,
index_t BlockCopyDataPerAccess_B>
struct DynamicGridwiseCol2Im_eb_nchw
{
// p_col_global: read-only column matrix [E, B].
// p_img_global: image [N, C, Hi, Wi]; written with atomic adds.
__device__ void Run(const Float* const __restrict__ p_col_global,
Float* const __restrict__ p_img_global) const
{
constexpr auto col_e_b_global_desc = ColGlobalDesc{};
constexpr auto img_n_c_hi_wi_global_desc = ImgGlobalDesc{};
// image and convolution geometry (compile-time constants)
constexpr index_t N = img_n_c_hi_wi_global_desc.GetLengths()[0];
constexpr index_t C = img_n_c_hi_wi_global_desc.GetLengths()[1];
constexpr index_t Hi = img_n_c_hi_wi_global_desc.GetLengths()[2];
constexpr index_t Wi = img_n_c_hi_wi_global_desc.GetLengths()[3];
constexpr index_t Ho = OutputSizes{}[0];
constexpr index_t Wo = OutputSizes{}[1];
constexpr index_t Y = FilterSizes{}[0];
constexpr index_t X = FilterSizes{}[1];
constexpr index_t ConvStrideH = ConvStrides{}[0];
constexpr index_t ConvStrideW = ConvStrides{}[1];
constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];
constexpr index_t E = C * Y * X;
constexpr index_t B = N * Ho * Wo;
// sanity-check for vectorized memory load
static_assert((Wo == 1 || (ConvStrideW == 1 || BlockCopyDataPerAccess_B == 1)) &&
(X == 1 || ConvDilationW % BlockCopyDataPerAccess_B == 0),
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated");
// divide block work by [E, B]
static_assert(E % EPerBlock == 0 && B % BPerBlock == 0,
"wrong! cannot divide work evenly among block");
constexpr index_t EBlockWork = E / EPerBlock;
constexpr index_t BBlockWork = B / BPerBlock;
// map the flat block id onto a (e, b) tile coordinate of the work grid
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<EBlockWork, BBlockWork>{});
const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
const index_t e_block_data_on_global = block_work_id[Number<0>{}] * EPerBlock;
const index_t b_block_data_on_global = block_work_id[Number<1>{}] * BPerBlock;
// construct img_eb_global_desc
// step 1: pad H and W so every filter window lies inside the padded image
constexpr auto img_n_c_hip_wip_global_desc = transform_tensor_descriptor(
img_n_c_hi_wi_global_desc,
make_tuple(
PassThrough<N>{}, PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));
constexpr index_t Hip = img_n_c_hip_wip_global_desc.GetLengths()[2];
constexpr index_t Wip = img_n_c_hip_wip_global_desc.GetLengths()[3];
// step 2: embed padded H -> (Y, Ho) and padded W -> (X, Wo), using the
// dilation as the filter-tap stride and the conv stride as the output stride
constexpr auto img_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
img_n_c_hip_wip_global_desc,
make_tuple(PassThrough<N>{},
PassThrough<C>{},
Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
// step 3: merge (C, Y, X) -> E and (N, Ho, Wo) -> B to match the col layout
constexpr auto img_e_b_global_desc = transform_tensor_descriptor(
img_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// blockwise atomic accumulation: one [EPerBlock, BPerBlock] tile per block
auto blockwise_copy = BlockwiseGenericTensorSliceCopy_v4<BlockSize,
decltype(col_e_b_global_desc),
decltype(img_e_b_global_desc),
Sequence<EPerBlock, BPerBlock>,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_B,
BlockCopyDataPerAccess_B,
AddressSpace::Vgpr,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd>(
make_multi_index(e_block_data_on_global, b_block_data_on_global),
make_multi_index(e_block_data_on_global, b_block_data_on_global));
// blockwise copy
blockwise_copy.Run(p_col_global, p_img_global);
}
};
} // namespace ck
#endif
#ifndef CK_DYNAMIC_GRIDWISE_COL2IM_GEMMKGEMMN_NCHW_HPP
#define CK_DYNAMIC_GRIDWISE_COL2IM_GEMMKGEMMN_NCHW_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
namespace ck {
// Builds a 2-D [GemmK, GemmN] descriptor view of the NCHW image tensor that
// matches the layout of the column ("col") matrix produced by im2col:
//   GemmK = C * Y * X   (filter-window elements)
//   GemmN = N * Ho * Wo (output positions)
// The view is created purely by descriptor transforms (pad -> embed -> merge);
// no data is moved. Writing through this view with atomic-add implements
// col2im at runtime-known sizes.
template <typename... In>
__host__ __device__ constexpr auto
map_img_into_col(const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const MultiIndex<2> out_sizes,
const MultiIndex<2> filter_sizes,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
// image and convolution geometry (runtime values)
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_sizes[I0];
const index_t Wo = out_sizes[I1];
const index_t Y = filter_sizes[I0];
const index_t X = filter_sizes[I1];
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
// step 1: apply left padding, then right padding, to H and W
// (two nested transforms because DynamicLeftPad/DynamicRightPad each handle
// one side; the inner result's lengths are Hi+InLeftPadH, Wi+InLeftPadW)
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);
// step 2: embed padded H -> (Y, Ho) and padded W -> (X, Wo), with the
// dilation as the filter-tap stride and the conv stride as the output stride
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
// step 3: merge (C, Y, X) -> GemmK and (N, Ho, Wo) -> GemmN
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemmk_gemmn_global_desc;
}
// Grid-wise col2im (dynamic-descriptor version).
//
// Copies col[GemmK, GemmN] into the [GemmK, GemmN] *view* of the NCHW image
// (see map_img_into_col). Overlapping filter windows alias the same image
// pixel, so the blockwise transfer accumulates with
// InMemoryDataOperation::AtomicAdd. Each block owns one GemmNPerBlock-wide
// strip and walks the full GemmK extent GemmKPerBlock rows at a time.
//
// Fixes relative to the previous revision:
// - the blockwise transfer is now constructed with the src/dst descriptors
//   its constructor requires (only the two origin indices were passed before);
// - the GemmK loop moves the transfer's own slice windows and runs a tail
//   copy, so the last GemmKPerBlock rows are no longer skipped (the old loop
//   bound `gemmk < GemmK - GemmKPerBlock` dropped them, and the coordinates
//   it moved were never consumed by the copy).
template <index_t BlockSize,
          index_t GemmKPerBlock,
          index_t GemmNPerBlock,
          typename BlockCopySubLengths_GemmK_GemmN,
          typename BlockCopyClusterLengths_GemmK_GemmN,
          typename BlockCopyThreadClusterArrangeOrder,
          typename BlockCopySrcAccessOrder,
          typename BlockCopyDstAccessOrder,
          index_t BlockCopyDataPerAccess_GemmN>
struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
{
    // p_col_global: read-only column matrix [GemmK, GemmN].
    // p_img_global: image buffer, addressed through img_gemmk_gemmn_global_desc,
    //               written with atomic adds.
    template <typename... Col, typename... Img>
    __device__ void Run(const float* const __restrict__ p_col_global,
                        float* const __restrict__ p_img_global,
                        const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
                        const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};

        const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);

        // each block handles one GemmNPerBlock-wide strip of the GemmN axis;
        // the launcher sizes the grid as GemmN / GemmNPerBlock
        const index_t gemmn_block_data_on_global = get_block_1d_id() * GemmNPerBlock;

        // blockwise copy with atomic accumulation into the image view
        auto blockwise_copy =
            BlockwiseDynamicTensorSliceTransfer_v1<BlockSize,
                                                   float,
                                                   float,
                                                   decltype(col_gemmk_gemmn_global_desc),
                                                   decltype(img_gemmk_gemmn_global_desc),
                                                   Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                   BlockCopySubLengths_GemmK_GemmN,
                                                   BlockCopyClusterLengths_GemmK_GemmN,
                                                   BlockCopyThreadClusterArrangeOrder,
                                                   BlockCopySrcAccessOrder,
                                                   BlockCopyDstAccessOrder,
                                                   1,
                                                   1,
                                                   BlockCopyDataPerAccess_GemmN,
                                                   BlockCopyDataPerAccess_GemmN,
                                                   AddressSpace::Global,
                                                   AddressSpace::Global,
                                                   InMemoryDataOperation::AtomicAdd,
                                                   1,
                                                   1>(
                col_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global),
                img_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global));

        // advance both slice windows one GemmKPerBlock block at a time
        const auto slice_window_step = make_multi_index(GemmKPerBlock, 0);

        // copy every GemmK block; the window is moved after each copy except
        // the last, which is handled by the tail Run below
        for(index_t gemmk = 0; gemmk < GemmK - GemmKPerBlock; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(p_col_global, p_img_global);

            blockwise_copy.MoveSrcSliceWindow(slice_window_step);
            blockwise_copy.MoveDstSliceWindow(slice_window_step);
        }

        // tail: the final GemmKPerBlock rows
        blockwise_copy.Run(p_col_global, p_img_global);
    }
};
} // namespace ck
#endif
#ifndef CK_CLUSTER_DESCRIPTOR_HPP
#define CK_CLUSTER_DESCRIPTOR_HPP
#include "common_header.hpp"
// TODO remove dependency on deprecated tensor descriptor
#include "tensor_descriptor.hpp"
namespace ck {
// A cluster maps a 1-d index to an N-d index: given N-d Lengths and a
// permutation ArrangeOrder, CalculateClusterIndex decomposes a flat id into
// the N-d coordinate of the corresponding cluster cell. Used e.g. to assign
// threads/blocks to tiles of a work grid.
// (Built on the deprecated static tensor descriptor -- see TODO above.)
template <typename Lengths, typename ArrangeOrder>
struct ClusterDescriptor
{
static constexpr index_t nDim = Lengths::Size();
// the 1-d <-> N-d mapping, implemented by merging the lengths reordered
// per ArrangeOrder into a single dimension
static constexpr auto mDesc = transform_tensor_descriptor(
make_native_tensor_descriptor_packed(Lengths{}),
make_tuple(Merge<decltype(Lengths::ReorderGivenNew2Old(ArrangeOrder{}))>{}),
make_tuple(ArrangeOrder{}),
make_tuple(Sequence<0>{}));
__host__ __device__ constexpr ClusterDescriptor()
{
static_assert(Lengths::Size() == nDim && ArrangeOrder::Size() == nDim,
"wrong! size not the same");
static_assert(is_valid_sequence_map<ArrangeOrder>{}, "wrong! ArrangeOrder is wrong");
}
// total number of cells in the cluster
__host__ __device__ static constexpr index_t GetElementSize() { return mDesc.GetElementSize(); }
// decompose a flat id (0 <= idx_1d < GetElementSize()) into an N-d index
__host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
{
return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
}
};
// Factory for ClusterDescriptor. When no arrange order is supplied, the
// identity permutation 0, 1, ..., nDim-1 is used.
template <typename Lengths,
          typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ __device__ constexpr auto make_cluster_descriptor(
    Lengths,
    ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
{
    return ClusterDescriptor<Lengths, ArrangeOrder>{};
}
} // namespace ck
#endif
...@@ -21,9 +21,8 @@ __host__ __device__ constexpr auto ...@@ -21,9 +21,8 @@ __host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_diff_visible); make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_diff_visible);
template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep> template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
__host__ __device__ void move_dynamic_tensor_coordinate(const TensorDesc& tensor_desc, __host__ __device__ constexpr void move_dynamic_tensor_coordinate(
TensorCoord& coord, const TensorDesc& tensor_desc, TensorCoord& coord, const TensorCoordStep& coord_step);
const TensorCoordStep& coord_step);
template <typename TensorDesc, typename TensorCoord> template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool __host__ __device__ constexpr bool
...@@ -129,7 +128,7 @@ struct DynamicTensorDescriptor ...@@ -129,7 +128,7 @@ struct DynamicTensorDescriptor
{ {
static_assert(Idx::Size() == GetNumOfDimension(), "wrong! inconsistent # of dimension"); static_assert(Idx::Size() == GetNumOfDimension(), "wrong! inconsistent # of dimension");
#if 0 // debug #if 1 // debug
return make_dynamic_tensor_coordinate(*this, idx).GetOffset(); return make_dynamic_tensor_coordinate(*this, idx).GetOffset();
#else #else
constexpr index_t ntransform = GetNumOfTransform(); constexpr index_t ntransform = GetNumOfTransform();
...@@ -509,7 +508,7 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d ...@@ -509,7 +508,7 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d
} }
template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep> template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
__host__ __device__ void move_dynamic_tensor_coordinate(const TensorDesc& tensor_desc, __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDesc& tensor_desc,
TensorCoord& coord, TensorCoord& coord,
const TensorCoordStep& coord_step) const TensorCoordStep& coord_step)
{ {
...@@ -608,5 +607,13 @@ __host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& ...@@ -608,5 +607,13 @@ __host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc&
coordinate_has_valid_offset_assuming_visible_index_is_valid(tensor_desc, coord); coordinate_has_valid_offset_assuming_visible_index_is_valid(tensor_desc, coord);
} }
template <typename TensorDesc>
using DynamicTensorCoordinate_t = decltype(
make_dynamic_tensor_coordinate(TensorDesc{}, MultiIndex<TensorDesc::GetNumOfDimension()>{}));
template <typename TensorDesc>
using DynamicTensorCoordinateStep_t = decltype(make_dynamic_tensor_coordinate_step(
TensorDesc{}, MultiIndex<TensorDesc::GetNumOfDimension()>{}));
} // namespace ck } // namespace ck
#endif #endif
...@@ -172,41 +172,5 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript ...@@ -172,41 +172,5 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
return make_native_tensor_descriptor(new_lengths, new_strides); return make_native_tensor_descriptor(new_lengths, new_strides);
} }
// a cluster map 1d index to N-d index
template <typename Lengths, typename ArrangeOrder>
struct ClusterDescriptor
{
static constexpr index_t nDim = Lengths::Size();
static constexpr auto mDesc = transform_tensor_descriptor(
make_native_tensor_descriptor_packed(Lengths{}),
make_tuple(Merge<decltype(Lengths::ReorderGivenNew2Old(ArrangeOrder{}))>{}),
make_tuple(ArrangeOrder{}),
make_tuple(Sequence<0>{}));
__host__ __device__ constexpr ClusterDescriptor()
{
static_assert(Lengths::Size() == nDim && ArrangeOrder::Size() == nDim,
"wrong! size not the same");
static_assert(is_valid_sequence_map<ArrangeOrder>{}, "wrong! ArrangeOrder is wrong");
}
__host__ __device__ static constexpr index_t GetElementSize() { return mDesc.GetElementSize(); }
__host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
{
return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
}
};
template <typename Lengths,
typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ __device__ constexpr auto make_cluster_descriptor(
Lengths, ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
{
return ClusterDescriptor<Lengths, decltype(order)>{};
}
} // namespace ck } // namespace ck
#endif #endif
#ifndef CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP
#define CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"
namespace ck {
// Block-wise tensor slice transfer.
//
// The threads of a block cooperatively copy a BlockSliceLengths window from a
// src tensor to a dst tensor; each participating thread moves its own
// ThreadSliceLengths sub-window, staged through a thread-private (Vgpr)
// buffer. Threads are mapped onto the window by ThreadClusterLengths /
// ThreadClusterArrangeOrder; when BlockSize exceeds the cluster size, the
// extra threads are masked out of every operation.
//
// Fixes relative to the previous revision:
// - Run() was declared const yet called the non-const RunLoad()/RunStore(),
//   and passed them extra arguments their signatures do not accept;
// - threadwise_read_/threadwise_write_ were default-constructed and never
//   received the block src/dst descriptors; they are now constructed in the
//   member-initializer list;
// - template parameter "SrcVectoReadDim" typo corrected (name-only change,
//   callers instantiate positionally).
template <index_t BlockSize,
          typename BlockSrcData,
          typename BlockDstData,
          typename BlockSrcDesc,
          typename BlockDstDesc,
          typename BlockSliceLengths,
          typename ThreadSliceLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorReadDim,
          index_t DstVectorWriteDim,
          index_t SrcDataPerRead,
          index_t DstDataPerWrite,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          InMemoryDataOperation DstInMemOp,
          index_t SrcDataStride,
          index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v1
{
    static constexpr index_t nDim =
        remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(const BlockSrcDesc& block_src_desc,
                                                                const Index& src_block_slice_origin,
                                                                const BlockDstDesc& block_dst_desc,
                                                                const Index& dst_block_slice_origin)
        // hand the descriptors to the threadwise workers up front; the
        // per-thread origins are filled in below once the cluster id is known
        : threadwise_read_(block_src_desc,
                           make_zero_multi_index<nDim>(),
                           thread_buffer_desc_,
                           make_zero_multi_index<nDim>()),
          threadwise_write_(thread_buffer_desc_,
                            make_zero_multi_index<nDim>(),
                            block_dst_desc,
                            make_zero_multi_index<nDim>())
    {
        static_assert(nDim == BlockSrcDesc::GetNumOfDimension() &&
                          nDim == BlockDstDesc::GetNumOfDimension() &&
                          nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
                          nDim == ThreadClusterLengths::Size() &&
                          nDim == ThreadClusterArrangeOrder::Size() &&
                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
                      "wrong! nDim not consistent");

        static_assert(
            is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
            "wrong! threads should be mapped to cover entire slicing window");

        static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
                      "wrong! BlockSize too small");

        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            // locate this thread's sub-window inside the block window
            const auto thread_cluster_id =
                thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());

            const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};

            threadwise_read_.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
            threadwise_read_.SetDstSliceOrigin(make_zero_multi_index<nDim>());

            threadwise_write_.SetSrcSliceOrigin(make_zero_multi_index<nDim>());
            threadwise_write_.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
        }
    }

    // Load this thread's sub-window from src into the private buffer.
    __device__ void RunLoad(const BlockSrcData* p_block_src)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_read_.Run(p_block_src, p_thread_buffer_);
        }
    }

    // Store the private buffer into this thread's sub-window of dst.
    __device__ void RunStore(BlockDstData* p_block_dst)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_write_.Run(p_thread_buffer_, p_block_dst);
        }
    }

    // Full transfer: src -> private buffer -> dst. RunLoad/RunStore already
    // mask out non-participating threads, so no extra guard is needed here.
    __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst)
    {
        RunLoad(p_block_src);

        // if there is type conversion, it's done during store
        RunStore(p_block_dst);
    }

    // Shift the src read window by `step` (all participating threads together).
    __device__ void MoveSrcSliceWindow(const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_read_.MoveSrcSliceWindow(step);
        }
    }

    // Shift the dst write window by `step` (all participating threads together).
    __device__ void MoveDstSliceWindow(const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_write_.MoveDstSliceWindow(step);
        }
    }

    private:
    // maps the flat thread id onto the thread cluster
    static constexpr auto thread_cluster_desc_ =
        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

    // packed descriptor of the thread-private staging buffer
    static constexpr auto thread_buffer_desc_ =
        make_dynamic_native_tensor_descriptor_packed<nDim>(to_multi_index(ThreadSliceLengths{}));

    using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1<BlockSrcDesc,
                                                                   decltype(thread_buffer_desc_),
                                                                   ThreadSliceLengths,
                                                                   SrcDimAccessOrder,
                                                                   SrcVectorReadDim,
                                                                   SrcDataPerRead,
                                                                   1,
                                                                   SrcAddressSpace,
                                                                   AddressSpace::Vgpr,
                                                                   InMemoryDataOperation::Set,
                                                                   SrcDataStride,
                                                                   1>;

    using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1<decltype(thread_buffer_desc_),
                                                                    BlockDstDesc,
                                                                    ThreadSliceLengths,
                                                                    DstDimAccessOrder,
                                                                    DstVectorWriteDim,
                                                                    1,
                                                                    DstDataPerWrite,
                                                                    AddressSpace::Vgpr,
                                                                    DstAddressSpace,
                                                                    DstInMemOp,
                                                                    1,
                                                                    DstDataStride>;

    ThreadwiseRead threadwise_read_;
    ThreadwiseWrite threadwise_write_;

    static constexpr index_t thread_buffer_element_size_ = thread_buffer_desc_.GetElementSpace();

    // thread-private staging buffer
    BlockSrcData p_thread_buffer_[thread_buffer_element_size_];
};
} // namespace ck
#endif
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "tensor_descriptor.hpp" #include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp" #include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp" #include "tensor_coordinate.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp"
namespace ck { namespace ck {
......
#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP
#define CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
namespace ck {
// Thread-wise 2-D tensor slice copy, hardcoded for a 2-d SliceLengths window.
//
// The window is walked in a zig-zag ("snake") order: dim1 is traversed
// forward then backward on alternating dim0 rows, so every move is a single
// +/-1 coordinate step and no coordinate reset is ever needed.
//
// NOTE(review): several template parameters (SrcDstDimAccessOrder,
// SrcDstVectorAccessDim, the ScalarPerVector / address-space / DstInMemOp /
// stride parameters) are accepted but not used in this body -- presumably
// reserved for vectorized/accumulating variants; confirm before relying on
// them. Also, forward_dim0 is never flipped, so dim0 always advances forward;
// the flag is redundant as written.
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
index_t SrcDstVectorAccessDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
__host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
const SrcDesc& src_desc,
const DynamicTensorCoordinate_t<SrcDesc>& src_origin_coord,
const SrcData* p_src,
const DstDesc& dst_desc,
const DynamicTensorCoordinate_t<DstDesc>& dst_origin_coord,
DstData* p_dst)
{
// working copies -- the caller's origin coordinates are left untouched
auto src_coord = src_origin_coord;
auto dst_coord = dst_origin_coord;
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
// precomputed unit steps in each direction for both tensors
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
// window extents
constexpr index_t J0 = SliceLengths{}[0];
constexpr index_t J1 = SliceLengths{}[1];
bool forward_dim0 = true;
bool forward_dim1 = true;
// hardcoded for 2d loop for now
#pragma unroll 1
for(int j0 = 0; j0 < J0; ++j0)
{
#pragma unroll 1
for(int j1 = 0; j1 < J1; ++j1)
{
// do work
p_dst[dst_coord.GetOffset()] = p_src[src_coord.GetOffset()];
// move dim1 iterator (not after the last element of the row)
if(j1 < J1 - 1)
{
if(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1);
}
}
}
// switch dim1 iteration direction
forward_dim1 = !forward_dim1;
// move dim0 iterator (not after the last row)
if(j0 < J0 - 1)
{
if(forward_dim0)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0);
}
}
}
}
// Thread-wise slice transfer object: wraps threadwise_dynamic_tensor_slice_
// transfer_v1 together with the src/dst descriptors and the current slice
// origins (as tensor coordinates) so a caller can Run repeatedly while moving
// the slice windows.
//
// Fixes relative to the previous revision:
// - the descriptors are stored BY VALUE. The old `const SrcDesc&` /
//   `const DstDesc&` reference members implicitly deleted the defaulted
//   default constructor (which BlockwiseDynamicTensorSliceTransfer_v1
//   default-constructs as members) and would dangle when a temporary
//   descriptor was passed to the constructor;
// - the constructor converts the raw origin indices into coordinates via
//   make_dynamic_tensor_coordinate, matching SetSrcSliceOrigin /
//   SetDstSliceOrigin, instead of initializing SrcCoord/DstCoord members
//   directly from a MultiIndex.
template <typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SrcDstDimAccessOrder,
          index_t SrcDstVectorAccessDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          InMemoryDataOperation DstInMemOp,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector>
struct ThreadwiseDynamicTensorSliceTransfer_v1
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));

    using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));

    // Members are value types, so this is well-formed (the old reference
    // members deleted it). Descriptors/origins must be set before use.
    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1() = default;

    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1(const SrcDesc& src_desc,
                                                                 const Index& src_slice_origin,
                                                                 const DstDesc& dst_desc,
                                                                 const Index& dst_slice_origin)
        : src_desc_(src_desc),
          dst_desc_(dst_desc),
          src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
          dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
    {
    }

    // Copy one SliceLengths window from p_src (at the current src origin) to
    // p_dst (at the current dst origin).
    template <typename SrcData, typename DstData>
    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
    {
        threadwise_dynamic_tensor_slice_transfer_v1<SrcData,
                                                    DstData,
                                                    SrcDesc,
                                                    DstDesc,
                                                    SliceLengths,
                                                    SrcDstDimAccessOrder,
                                                    SrcDstVectorAccessDim,
                                                    SrcScalarPerVector,
                                                    DstScalarPerVector,
                                                    SrcAddressSpace,
                                                    DstAddressSpace,
                                                    DstInMemOp,
                                                    SrcScalarStrideInVector,
                                                    DstScalarStrideInVector>(
            src_desc_, src_slice_origin_, p_src, dst_desc_, dst_slice_origin_, p_dst);
    }

    // Reposition the src window at an absolute index.
    __device__ void SetSrcSliceOrigin(const Index& src_slice_origin_idx)
    {
        src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc_, src_slice_origin_idx);
    }

    // Reposition the dst window at an absolute index.
    __device__ void SetDstSliceOrigin(const Index& dst_slice_origin_idx)
    {
        dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc_, dst_slice_origin_idx);
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveSrcSliceWindow(const Index& src_slice_origin_step_idx)
    {
        const auto src_slice_origin_step =
            make_dynamic_tensor_coordinate_step(src_desc_, src_slice_origin_step_idx);

        move_dynamic_tensor_coordinate(src_desc_, src_slice_origin_, src_slice_origin_step);
    }

    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveDstSliceWindow(const Index& dst_slice_origin_step_idx)
    {
        const auto dst_slice_origin_step =
            make_dynamic_tensor_coordinate_step(dst_desc_, dst_slice_origin_step_idx);

        move_dynamic_tensor_coordinate(dst_desc_, dst_slice_origin_, dst_slice_origin_step);
    }

    private:
    SrcDesc src_desc_;
    DstDesc dst_desc_;

    SrcCoord src_slice_origin_;
    DstCoord dst_slice_origin_;
};
} // namespace ck
#endif
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dynamic_gridwise_col2im_eb_nchw.hpp"
// Host-side driver for DynamicGridwiseCol2Im_eb_nchw: uploads col/img,
// launches the kernel nrepeat times with timing, then downloads the result.
//
// NOTE(review): GridSize is computed with ceil-division, but the gridwise
// kernel static_asserts that E and B divide EPerBlock/BPerBlock exactly --
// so the ceil here never rounds in practice; confirm intent.
template <typename T,
typename ColDesc,
typename ImgDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_dynamic_col2im_eb_nchw(ColDesc,
const Tensor<T>& col_eb,
ImgDesc,
Tensor<T>& img_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr auto col_eb_desc = ColDesc{};
constexpr auto img_nchw_desc = ImgDesc{};
// tensor extents (compile-time); E = C*Y*X rows, B = N*Ho*Wo columns
constexpr index_t N = img_nchw_desc.GetLengths()[0];
constexpr index_t C = img_nchw_desc.GetLengths()[1];
constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
constexpr index_t Wi = img_nchw_desc.GetLengths()[3];
constexpr index_t E = col_eb_desc.GetLengths()[0];
constexpr index_t B = col_eb_desc.GetLengths()[1];
// allocate device buffers and upload both tensors (img is uploaded too,
// since the kernel accumulates into it atomically)
std::size_t data_sz = sizeof(T);
DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());
col_eb_device_buf.ToDevice(col_eb.mData.data());
img_nchw_device_buf.ToDevice(img_nchw.mData.data());
// tuning parameters: 256 threads per block, 128x128 tile per block,
// 8x8 per thread in a 16x16 thread cluster
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t EPerBlock = 128;
constexpr index_t BPerBlock = 128;
using BlockCopySubLengths_E_B = Sequence<8, 8>;
using BlockCopyClusterLengths_E_B = Sequence<16, 16>;
using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using BlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using BlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
constexpr index_t BlockCopyDataPerAccess_B = 1;
#endif
// one block per [EPerBlock, BPerBlock] tile
constexpr index_t GridSize =
((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_col2im =
DynamicGridwiseCol2Im_eb_nchw<GridSize,
BlockSize,
T,
ColDesc,
ImgDesc,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
EPerBlock,
BPerBlock,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
BlockCopyDataPerAccess_B>{};
// launch nrepeat times and report the average wall time
for(index_t i = 0; i < 1; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
const T* const __restrict__,
T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
const_cast<const T* const __restrict__>(
static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
const_cast<T* const __restrict__>(
static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
std::cout << "Average time : " << ave_time << " ms" << std::endl;
}
// copy the accumulated image back to the host
img_nchw_device_buf.FromDevice(img_nchw.mData.data());
}
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dynamic_gridwise_col2im_gemmkgemmn_nchw.hpp"
// Host-side driver for the dynamic col2im (GemmK x GemmN -> NCHW) kernel.
//
// Uploads the column matrix (and the image buffer, which col2im accumulates
// into), builds runtime tensor descriptors for the column view and for the
// image viewed through the convolution-window mapping, launches
// DynamicGridwiseCol2Im_gemmkgemmn_nchw `nrepeat` times while timing the
// batch, then downloads the resulting image.
//
// The ColDesc/ImgDesc/FilterSizes/... parameters are compile-time tag types
// carrying lengths/strides/pads; only the two Tensor<T> arguments move data.
// NOTE(review): assumes GemmN is evenly divisible by GemmNPerBlock — there is
// no remainder-tile handling; verify against callers.
template <typename T,
          typename ColDesc,
          typename ImgDesc,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_dynamic_col2im_gemmkgemmn_nchw(ColDesc,
                                           const Tensor<T>& col_gemmk_gemmn,
                                           ImgDesc,
                                           Tensor<T>& img_n_c_hi_wi,
                                           FilterSizes,
                                           OutputSizes,
                                           ConvStrides,
                                           ConvDilations,
                                           InLeftPads,
                                           InRightPads,
                                           std::size_t nrepeat)
{
    using namespace ck;

    constexpr auto I1 = Number<1>{};

    const std::size_t data_sz = sizeof(T);

    DeviceMem col_gemmk_gemmn_device_buf(data_sz * col_gemmk_gemmn.mDesc.GetElementSpace());
    DeviceMem img_n_c_hi_wi_device_buf(data_sz * img_n_c_hi_wi.mDesc.GetElementSpace());

    col_gemmk_gemmn_device_buf.ToDevice(col_gemmk_gemmn.mData.data());
    // the image buffer is uploaded too: col2im accumulates into it
    img_n_c_hi_wi_device_buf.ToDevice(img_n_c_hi_wi.mData.data());

    // runtime (dynamic) descriptors built from the compile-time tag types
    const auto col_gemmk_gemmn_desc = make_dynamic_native_tensor_descriptor<2>(
        to_multi_index(ColDesc::GetLengths()), to_multi_index(ColDesc::GetStrides()));

    const auto img_n_c_hi_wi_desc = make_dynamic_native_tensor_descriptor<4>(
        to_multi_index(ImgDesc::GetLengths()), to_multi_index(ImgDesc::GetStrides()));

    const auto filter_sizes   = to_multi_index(FilterSizes{});
    const auto out_sizes      = to_multi_index(OutputSizes{});
    const auto conv_strides   = to_multi_index(ConvStrides{});
    const auto conv_dilations = to_multi_index(ConvDilations{});
    const auto in_left_pads   = to_multi_index(InLeftPads{});
    const auto in_right_pads  = to_multi_index(InRightPads{});

    // image re-expressed in (GemmK, GemmN) coordinates via the im2col mapping
    const auto img_gemmk_gemmn_desc = map_img_into_col(img_n_c_hi_wi_desc,
                                                       out_sizes,
                                                       filter_sizes,
                                                       conv_strides,
                                                       conv_dilations,
                                                       in_left_pads,
                                                       in_right_pads);

    const index_t GemmN = col_gemmk_gemmn_desc.GetLength(I1);

#if 1
    // block-copy tuning parameters
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmKPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;

    using BlockCopySubLengths_GemmK_GemmN     = Sequence<8, 8>;
    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<16, 16>;

    using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopySrcAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopyDstAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]

    constexpr index_t BlockCopyDataPerAccess_GemmN = 1;
#endif

    // one grid column per GemmN tile; remainder tiles are silently dropped if
    // GemmN is not a multiple of GemmNPerBlock (see NOTE above)
    const index_t GridSize = GemmN / GemmNPerBlock;

    // index_t may be signed; cast to long so the format specifier is portable
    printf("%s: BlockSize %ld, GridSize %ld \n",
           __func__,
           static_cast<long>(BlockSize),
           static_cast<long>(GridSize));

    constexpr auto gridwise_col2im =
        DynamicGridwiseCol2Im_gemmkgemmn_nchw<BlockSize,
                                              GemmKPerBlock,
                                              GemmNPerBlock,
                                              BlockCopySubLengths_GemmK_GemmN,
                                              BlockCopyClusterLengths_GemmK_GemmN,
                                              BlockCopyThreadClusterArrangeOrder,
                                              BlockCopySrcAccessOrder,
                                              BlockCopyDstAccessOrder,
                                              BlockCopyDataPerAccess_GemmN>{};

    {
        std::cout << "Start running " << nrepeat << " times..." << std::endl;

        KernelTimer timer;
        timer.Start();

        // std::size_t loop index avoids signed/unsigned comparison with nrepeat
        for(std::size_t j = 0; j < nrepeat; ++j)
        {
            launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
                                                 const T* const __restrict__,
                                                 T* const __restrict__,
                                                 decltype(col_gemmk_gemmn_desc),
                                                 decltype(img_gemmk_gemmn_desc)>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          0,
                          const_cast<const T* const __restrict__>(
                              static_cast<T*>(col_gemmk_gemmn_device_buf.GetDeviceBuffer())),
                          const_cast<T* const __restrict__>(
                              static_cast<T*>(img_n_c_hi_wi_device_buf.GetDeviceBuffer())),
                          col_gemmk_gemmn_desc,
                          img_gemmk_gemmn_desc);
        }

        timer.End();

        // nrepeat == 0 yields inf here; callers are expected to pass >= 1
        float ave_time = timer.GetElapsedTime() / nrepeat;
        std::cout << "Average time : " << ave_time << " ms" << std::endl;
    }

    img_n_c_hi_wi_device_buf.FromDevice(img_n_c_hi_wi.mData.data());
}
...@@ -12,45 +12,30 @@ ...@@ -12,45 +12,30 @@
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "host_col2im.hpp" #include "host_col2im.hpp"
#include "device_col2im_eb_nchw.hpp" #include "device_col2im_eb_nchw.hpp"
#include "device_dynamic_col2im_eb_nchw.hpp" #include "device_dynamic_col2im_gemmkgemmn_nchw.hpp"
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
using namespace ck; using namespace ck;
#if 1 #if 0
constexpr index_t N = 2; // 3x3, 71x71
constexpr index_t C = 8; constexpr index_t N = 128;
constexpr index_t HI = 8; constexpr index_t C = 192;
constexpr index_t WI = 8; constexpr index_t HI = 71;
constexpr index_t K = 128; constexpr index_t WI = 71;
constexpr index_t Y = 4;
constexpr index_t X = 4;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<2, 2>;
#elif 0
// 3x3, 34x34
constexpr index_t N = 64;
constexpr index_t C = 256;
constexpr index_t HI = 34;
constexpr index_t WI = 34;
constexpr index_t K = 128; constexpr index_t K = 128;
constexpr index_t Y = 3; constexpr index_t Y = 3;
constexpr index_t X = 3; constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<1, 1>;
#elif 0 #elif 0
// 1x1 filter, 8x8 image // 1x1, 8x8
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 128;
constexpr index_t N = 64;
constexpr index_t C = 1536; constexpr index_t C = 1536;
constexpr index_t HI = 8; constexpr index_t HI = 8;
constexpr index_t WI = 8; constexpr index_t WI = 8;
...@@ -64,13 +49,12 @@ int main(int argc, char* argv[]) ...@@ -64,13 +49,12 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 8x8 image // 1x1, 73x73
// cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 2048; constexpr index_t C = 160;
constexpr index_t HI = 8; constexpr index_t HI = 73;
constexpr index_t WI = 8; constexpr index_t WI = 73;
constexpr index_t K = 384; constexpr index_t K = 64;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
...@@ -80,31 +64,104 @@ int main(int argc, char* argv[]) ...@@ -80,31 +64,104 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 7x7 image // 3x3, 35x35
// cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 832; constexpr index_t C = 96;
constexpr index_t HI = 7; constexpr index_t HI = 35;
constexpr index_t WI = 7; constexpr index_t WI = 35;
constexpr index_t K = 384; constexpr index_t K = 96;
constexpr index_t Y = 1; constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 3x3, 71x71
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t HI = 71;
constexpr index_t WI = 71;
constexpr index_t K = 192;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 7x1, 17x17
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 7;
constexpr index_t X = 1; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>;
#elif 0
// 1x7, 17x17
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 7;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 3>;
using RightPads = Sequence<0, 3>;
#elif 0
// 3x3, 299x299 stride=2
constexpr index_t N = 128;
constexpr index_t C = 3;
constexpr index_t HI = 299;
constexpr index_t WI = 299;
constexpr index_t K = 32;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 8x8 image // 3x3, 147x147
// cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 1280; constexpr index_t C = 32;
constexpr index_t HI = 8; constexpr index_t HI = 147;
constexpr index_t WI = 8; constexpr index_t WI = 147;
constexpr index_t K = 384; constexpr index_t K = 64;
constexpr index_t Y = 1; constexpr index_t Y = 3;
constexpr index_t X = 1; constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 3x3, 149x149
constexpr index_t N = 128;
constexpr index_t C = 32;
constexpr index_t HI = 149;
constexpr index_t WI = 149;
constexpr index_t K = 32;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
...@@ -112,77 +169,147 @@ int main(int argc, char* argv[]) ...@@ -112,77 +169,147 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 14x14 image // 3x3, 17x17, stride 2
// cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 512; constexpr index_t C = 192;
constexpr index_t HI = 14; constexpr index_t HI = 17;
constexpr index_t WI = 14; constexpr index_t WI = 17;
constexpr index_t K = 128; constexpr index_t K = 192;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 35x35
constexpr index_t N = 128;
constexpr index_t C = 384;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 96;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 35;
constexpr index_t WI = 35;
constexpr index_t K = 384;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 8x8 image // 1x3, 8x8
// cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61% constexpr index_t N = 128;
constexpr index_t N = 64; constexpr index_t C = 384;
constexpr index_t C = 1536;
constexpr index_t HI = 8; constexpr index_t HI = 8;
constexpr index_t WI = 8; constexpr index_t WI = 8;
constexpr index_t K = 384; constexpr index_t K = 448;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 1>;
using RightPads = Sequence<0, 1>;
#elif 0
// 3x1, 8x8
constexpr index_t N = 128;
constexpr index_t C = 448;
constexpr index_t HI = 8;
constexpr index_t WI = 8;
constexpr index_t K = 512;
constexpr index_t Y = 3;
constexpr index_t X = 1; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 0>;
using RightPads = Sequence<1, 0>;
#elif 0
// 3x3, 147x147
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 147;
constexpr index_t WI = 147;
constexpr index_t K = 96;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 28x28 image // 7x1, 73x73
// cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 256; constexpr index_t C = 64;
constexpr index_t HI = 28; constexpr index_t HI = 73;
constexpr index_t WI = 28; constexpr index_t WI = 73;
constexpr index_t K = 128; constexpr index_t K = 64;
constexpr index_t Y = 1; constexpr index_t Y = 7;
constexpr index_t X = 1; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>;
#elif 0
// 3x3, 73x73
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 73;
constexpr index_t WI = 73;
constexpr index_t K = 96;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 7x7 image // 1x1, 14x14, stride 2
// cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 832; constexpr index_t C = 1024;
constexpr index_t HI = 7; constexpr index_t HI = 14;
constexpr index_t WI = 7; constexpr index_t WI = 14;
constexpr index_t K = 256; constexpr index_t K = 2048;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 17x17 input // 1x1, 14x14
// cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 768; constexpr index_t C = 1024;
constexpr index_t HI = 17; constexpr index_t HI = 14;
constexpr index_t WI = 17; constexpr index_t WI = 14;
constexpr index_t K = 128; constexpr index_t K = 256;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
...@@ -192,63 +319,104 @@ int main(int argc, char* argv[]) ...@@ -192,63 +319,104 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 14x14 image // 1x1, 14x14, stride 2
// cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 528; constexpr index_t C = 1024;
constexpr index_t HI = 14; constexpr index_t HI = 14;
constexpr index_t WI = 14; constexpr index_t WI = 14;
constexpr index_t K = 128; constexpr index_t K = 512;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 1
// 3x3, 28x28
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0 #elif 0
// 1x1 filter, 14x14 image // 3x3, 14x14
// cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 528; constexpr index_t C = 256;
constexpr index_t HI = 14; constexpr index_t HI = 14;
constexpr index_t WI = 14; constexpr index_t WI = 14;
constexpr index_t K = 256; constexpr index_t K = 256;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 1x1, 56x56, stride 2
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 56;
constexpr index_t WI = 56;
constexpr index_t K = 128;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 1x1 filter, 7x7 image // 7x7, 230x230 stride=2
// cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 832; constexpr index_t C = 3;
constexpr index_t HI = 7; constexpr index_t HI = 230;
constexpr index_t WI = 7; constexpr index_t WI = 230;
constexpr index_t K = 128; constexpr index_t K = 64;
constexpr index_t Y = 7;
constexpr index_t X = 7;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1, 28x28, stride = 2
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 1024;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // 1x1, 28x28, stride 2
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 288; constexpr index_t C = 512;
constexpr index_t HI = 35; constexpr index_t HI = 28;
constexpr index_t WI = 35; constexpr index_t WI = 28;
constexpr index_t K = 384; constexpr index_t K = 256;
constexpr index_t Y = 3; constexpr index_t Y = 1;
constexpr index_t X = 3; constexpr index_t X = 1;
using ConvStrides = Sequence<2, 2>; using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
...@@ -256,50 +424,65 @@ int main(int argc, char* argv[]) ...@@ -256,50 +424,65 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 5x5 filter, 2x2 pad, 7x7 input // 1x1, 7x7
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 48; constexpr index_t C = 512;
constexpr index_t HI = 7; constexpr index_t HI = 7;
constexpr index_t WI = 7; constexpr index_t WI = 7;
constexpr index_t K = 128; constexpr index_t K = 2048;
constexpr index_t Y = 5; constexpr index_t Y = 1;
constexpr index_t X = 5; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<2, 2>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<2, 2>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 0
// 7x1 filter, 3x0 pad, 17x17 input // 3x3, 7x7
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 128; constexpr index_t C = 512;
constexpr index_t HI = 17; constexpr index_t HI = 7;
constexpr index_t WI = 17; constexpr index_t WI = 7;
constexpr index_t K = 128; constexpr index_t K = 512;
constexpr index_t Y = 7; constexpr index_t Y = 3;
constexpr index_t X = 1; constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<3, 0>; using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<3, 0>; using RightPads = Sequence<1, 1>;
#elif 1 #elif 0
// 1x7 filter, 0x3 pad, 17x17 input // 1x1, 56x56
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 128; constexpr index_t C = 64;
constexpr index_t HI = 17; constexpr index_t HI = 56;
constexpr index_t WI = 17; constexpr index_t WI = 56;
constexpr index_t K = 128; constexpr index_t K = 64;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 7; constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 3>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 3>; using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 56x56
constexpr index_t N = 128;
constexpr index_t C = 64;
constexpr index_t HI = 56;
constexpr index_t WI = 56;
constexpr index_t K = 64;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#endif #endif
constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{}); constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
...@@ -362,8 +545,8 @@ int main(int argc, char* argv[]) ...@@ -362,8 +545,8 @@ int main(int argc, char* argv[])
LeftPads{}, LeftPads{},
RightPads{}, RightPads{},
nrepeat); nrepeat);
#else #elif 0
device_dynamic_col2im_eb_nchw(col_eb_desc, device_dynamic_col2im_gemmkgemmn_nchw(col_eb_desc,
col_eb, col_eb,
img_nchw_desc, img_nchw_desc,
img_nchw_device, img_nchw_device,
...@@ -376,6 +559,34 @@ int main(int argc, char* argv[]) ...@@ -376,6 +559,34 @@ int main(int argc, char* argv[])
nrepeat); nrepeat);
#endif #endif
#if 0
constexpr auto lengths = to_multi_index(Sequence<1,2>{});
constexpr auto thread_buffer_desc_ =
make_dynamic_native_tensor_descriptor_packed<2>(lengths);
#elif 1
constexpr auto lengths = to_multi_index(Sequence<1, 2>{});
constexpr index_t NDim = 2;
constexpr auto transforms = make_tuple(DynamicUnMerge<NDim>{lengths});
constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
constexpr auto up_dim_hidden_idss =
make_tuple(typename arithmetic_sequence_gen<1, NDim + 1, 1>::type{});
constexpr auto visible_dim_hidden_ids =
typename arithmetic_sequence_gen<1, NDim + 1, 1>::type{};
constexpr index_t element_space_size =
container_reduce(lengths, math::multiplies<index_t>{}, index_t{1});
constexpr auto desc =
DynamicTensorDescriptor<decltype(transforms),
decltype(low_dim_hidden_idss),
decltype(up_dim_hidden_idss),
decltype(
visible_dim_hidden_ids)>{}; //{transforms, element_space_size};
#endif
if(do_verification) if(do_verification)
{ {
host_col2im(col_eb, host_col2im(col_eb,
......
...@@ -549,7 +549,7 @@ int main(int argc, char* argv[]) ...@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
#endif #endif
} }
#if 1 #if 0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw, in_nchw,
wei_kcyx_desc, wei_kcyx_desc,
...@@ -561,7 +561,7 @@ int main(int argc, char* argv[]) ...@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
LeftPads{}, LeftPads{},
RightPads{}, RightPads{},
nrepeat); nrepeat);
#elif 1 #elif 0
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw, in_nchw,
wei_kcyx_desc, wei_kcyx_desc,
...@@ -598,7 +598,7 @@ int main(int argc, char* argv[]) ...@@ -598,7 +598,7 @@ int main(int argc, char* argv[])
RightPads{}, RightPads{},
nrepeat); nrepeat);
#elif 1 #elif 1
device_dummy_dynamic_transform_v2(in_nchw_desc, device_dummy_dynamic_transform(in_nchw_desc,
in_nchw, in_nchw,
wei_kcyx_desc, wei_kcyx_desc,
wei_kcyx, wei_kcyx,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment