Commit c01af899 authored by Chao Liu

added new tensor copy operator

parent e9733a9f
......@@ -295,27 +295,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
// do work
for(index_t e = 0; e < E; e += EPerBlock)
{
#if 0 // debug
blockwise_in_copy.Run(p_in_global, p_in_block);
blockwise_wei_copy.Run(p_wei_global, p_wei_block);
#else
using InSrcMergedDimSubLengthsHack = Sequence<InBlockCopySubLengths_E_N1_B_N2{}[0],
1,
InBlockCopySubLengths_E_N1_B_N2{}[2],
1>;
using InDstMergedDimSubLengthsHack = Sequence<1, 1, 1, 1>;
blockwise_in_copy.Run_hack(p_in_global,
p_in_block,
InSrcMergedDimSubLengthsHack{},
InDstMergedDimSubLengthsHack{});
using WeiSrcMergedDimSubLengthsHack = Sequence<1, 1>;
using WeiDstMergedDimSubLengthsHack = Sequence<1, 1>;
blockwise_wei_copy.Run_hack(p_wei_global,
p_wei_block,
WeiSrcMergedDimSubLengthsHack{},
WeiDstMergedDimSubLengthsHack{});
#endif
__syncthreads();
......@@ -391,10 +372,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
arithmetic_sequence_gen<0, 8, 1>::type{},
Number<1>{});
#else
using OutSrcMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>;
using OutDstMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>;
ThreadwiseGenericTensorSliceCopy_v2<
Float,
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
......@@ -403,10 +380,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
{0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
.Run_hack(p_out_thread,
p_out_thread_on_global,
OutSrcMergedDimSliceLengthsHack{},
OutDstMergedDimSliceLengthsHack{});
.Run(p_out_thread, p_out_thread_on_global);
#endif
}
}
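The Run_hack overload and its per-call *MergedDimSubLengthsHack sequences are dropped here because ThreadwiseGenericTensorSliceCopy_v2::Run now derives those sequences internally (see the threadwise copy changes further down). A minimal sketch of the simplified call, with hypothetical names standing in for the kernel's out_n0_n1_n2_k0_k1_k2_h_w_* descriptors and buffers:

// Sketch only: out_thread_desc, out_global_desc, p_thread and p_global are placeholders.
ThreadwiseGenericTensorSliceCopy_v2<
    Float,
    decltype(out_thread_desc),
    decltype(out_global_desc),
    NormalTensorCoordinate<decltype(out_thread_desc)>,
    MergedTensorCoordinate<decltype(out_global_desc)>,
    decltype(out_thread_desc.GetLengths())>(
    {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
    .Run(p_thread, p_global); // no *MergedDimSliceLengthsHack sequences at the call site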
......
......@@ -155,6 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not satisfied");
#if 1 // debug
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in
......@@ -172,6 +173,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2>(
{0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
#else
auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
BlockSize,
Float,
decltype(in_e_n1_b_n2_global_merged_desc),
decltype(in_e_n1_b_n2_block_desc),
MergedTensorCoordinate<decltype(in_e_n1_b_n2_global_merged_desc)>,
NormalTensorCoordinate<decltype(in_e_n1_b_n2_block_desc)>,
decltype(in_e_n1_b_n2_block_desc.GetLengths()),
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
#endif
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
......@@ -184,6 +198,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Sequence<EPerBlock, KPerBlock>{},
Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
#if 1 // debug
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
// this copy operator already has blockwise offset built-in
......@@ -201,6 +216,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>(
{0, k_block_data_on_global}, {0, 0});
#else
auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
BlockSize,
Float,
decltype(wei_e_k_global_desc),
decltype(wei_e_k_block_desc),
NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
decltype(wei_e_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
#endif
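Both copy operators selected above take the same block-level slice origins in their constructors and expose Run, RunLoadRegisterBuffer and RunStoreRegisterBuffer, which is what lets the #if 1 / #else toggle switch between the v1 and v2 implementations without touching the rest of the kernel. A minimal usage sketch inside the kernel body, with p_src, p_dst and p_buf as hypothetical stand-ins for the global, LDS and register pointers:

// Sketch: blockwise_copy is either of the operators constructed above (v1 or v2).
Float p_buf[blockwise_copy.GetRegisterBufferSize()]; // per-thread register staging

blockwise_copy.Run(p_src, p_dst);                    // one-shot: load + store

// or, when the two phases must be interleaved with compute (double buffering):
blockwise_copy.RunLoadRegisterBuffer(p_src, p_buf);  // global memory -> registers
blockwise_copy.RunStoreRegisterBuffer(p_buf, p_dst); // registers -> LDS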
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
......@@ -291,53 +319,60 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float* p_wei_block_next =
even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
#if 1
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
#else
blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
#endif
__syncthreads();
// LDS double buffer: load next data from device mem
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global,
p_wei_register_clipboard);
blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global,
p_wei_register_buffer);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
// LDS double buffer: store next data to LDS
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next);
blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
}
}
// LDS double buffer: tail
{
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
// even iteration
#if 1
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
#else
blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
#endif
__syncthreads();
// LDS double buffer: load next data from device mem
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global,
p_wei_register_clipboard);
blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
// LDS double buffer: store next data to LDS
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
p_wei_block_double + wei_block_space);
// odd iteration
......@@ -396,6 +431,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
k_thread_data_on_global, 0, b_thread_data_on_global, 0);
#if 1 // debug
threadwise_generic_tensor_slice_copy_v1(
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
p_out_thread,
......@@ -406,6 +442,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
arithmetic_sequence_gen<0, 8, 1>::type{},
Number<1>{});
#else
ThreadwiseGenericTensorSliceCopy_v2<
Float,
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
{0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
.Run(p_out_thread, p_out_thread_on_global);
#endif
}
}
};
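The double-buffered main loop above follows a fixed rhythm: advance the source window, prefetch the next E-slab into registers, run the GEMM on the slab already resident in LDS, then commit the registers to the other half of the double buffer. A consolidated sketch of one iteration, using the kernel's own names (p_in_reg / p_wei_reg abbreviate p_in_register_buffer / p_wei_register_buffer):

Float p_in_reg[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_reg[blockwise_wei_copy.GetRegisterBufferSize()];

// advance the slicing window / global pointer to the next E-slab
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);

__syncthreads();

// prefetch the next slab into registers ...
blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_reg);
blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_reg);

// ... while the GEMM consumes the slab already resident in LDS ...
blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

// ... then commit the prefetched registers to the other half of the double buffer
blockwise_in_copy.RunStoreRegisterBuffer(p_in_reg, p_in_block_next);
blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_reg, p_wei_block_next);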
......
......@@ -11,7 +11,7 @@
namespace ck {
// B = merge(N, H, W)
// B = merge(N, Ho, Wo)
template <index_t GridSize,
index_t BlockSize,
class Float,
......@@ -237,24 +237,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
for(index_t e_block_data_begin = 0; e_block_data_begin < E; e_block_data_begin += EPerBlock)
{
#if 0
blockwise_in_copy.Run(p_in_global, p_in_block);
blockwise_wei_copy.Run(p_wei_global, p_wei_block);
#else
using InSrcMergedDimSubLengthsHack = InBlockCopySubLengths_E_B;
using InDstMergedDimSubLengthsHack = Sequence<1, 1>;
blockwise_in_copy.Run_hack(p_in_global,
p_in_block,
InSrcMergedDimSubLengthsHack{},
InDstMergedDimSubLengthsHack{});
using WeiSrcMergedDimSubLengthsHack = Sequence<1, 1>;
using WeiDstMergedDimSubLengthsHack = Sequence<1, 1>;
blockwise_wei_copy.Run_hack(p_wei_global,
p_wei_block,
WeiSrcMergedDimSubLengthsHack{},
WeiDstMergedDimSubLengthsHack{});
#endif
__syncthreads();
......@@ -318,17 +302,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
{
#if 0
threadwise_out_copy.Run(p_out_thread, p_out_global);
#else
using OutSrcMergedDimSubLengthsHack = Sequence<1, 1, 1>;
using OutDstMergedDimSubLengthsHack =
Sequence<1, 1, OutThreadCopySliceLengths{}[2]>;
threadwise_out_copy.Run_hack(p_out_thread,
p_out_global,
OutSrcMergedDimSubLengthsHack{},
OutDstMergedDimSubLengthsHack{});
#endif
threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
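In the v4r4 epilogue the same threadwise copy is reused across GemmNRepeat output tiles by shifting its slicing windows between calls: the source window (thread registers) advances by GemmNPerThreadSubC and the destination window (global memory) by B1 in the last dimension. A sketch of that loop shape, using the kernel's names:

for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
{
    threadwise_out_copy.Run(p_out_thread, p_out_global);

    // shift the per-thread window over the register tile and over global memory
    threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
    threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
}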
......
......@@ -24,7 +24,7 @@ template <index_t BlockSize,
class DstDesc,
class SliceLengths,
class SubLengths,
class DataClusterLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcAccessOrder,
class DstAccessOrder,
......@@ -65,7 +65,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
// check NDim consistency
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
"wrong");
......@@ -78,13 +79,13 @@ struct BlockwiseGenericTensorSliceCopy_v1
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
......@@ -160,9 +161,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
__device__ static constexpr index_t GetRegisterClipboardSize()
__device__ static constexpr index_t GetRegisterBufferSize()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
......@@ -170,14 +171,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_Buffer) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
......@@ -187,25 +189,24 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr auto src_thread_data_multi_id_begin =
repeat_multi_id * data_per_cluster_per_dims;
constexpr auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths;
constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
constexpr index_t Buffer_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
const index_t Buffer_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
#endif
// By positioning the origin of the per-thread window at the point where the multi-index
......@@ -219,7 +220,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
p_src + src_offset + mThreadSrcOffset,
make_zero_array<index_t, nDim>(),
thread_tensor_desc,
p_clipboard + clipboard_offset,
p_Buffer + Buffer_offset,
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
SrcAccessOrder{},
......@@ -227,38 +228,38 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_Buffer,
Float* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
constexpr auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths;
constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
constexpr index_t Buffer_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
constexpr index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
const index_t Buffer_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#endif
......@@ -271,7 +272,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
// If, in the future, you want to enable SubLengths > 1 on a merged dimension,
// special care is needed in the implementation
threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
p_clipboard + clipboard_offset,
p_Buffer + Buffer_offset,
make_zero_array<index_t, nDim>(),
DstDesc{},
p_dst + dst_offset + mThreadDstOffset,
......@@ -284,10 +285,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
Float p_clipboard[GetRegisterClipboardSize()];
Float p_Buffer[GetRegisterBufferSize()];
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
RunLoadRegisterBuffer(p_src, p_Buffer);
RunStoreRegisterBuffer(p_Buffer, p_dst);
}
// When moving the slicing windows along a merged dimension, if the strides of the
......@@ -382,24 +383,30 @@ template <index_t BlockSize,
class DstCoordinate,
class SliceLengths,
class SubLengths,
class DataClusterLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder>
struct BlockwiseGenericTensorSliceCopy_v2
{
using ThreadwiseCopy = ThreadwiseGenericTensorSliceCopy_v2<TData,
SrcDesc,
DstDesc,
SrcCoordinate,
DstCoordinate,
SubLengths>;
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
DstCoordinate dst_block_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize(),
"wrong! nDim not consistent");
static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! BlockSize not consistent with ThreadClusterLengths");
const auto thread_cluster_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
......@@ -409,43 +416,66 @@ struct BlockwiseGenericTensorSliceCopy_v2
const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
mThreadwiseCopy.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
mThreadwiseCopy.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
}
__device__ void Run(const TData* p_src, TData* p_dst) const
__device__ static constexpr index_t GetRegisterBufferSize()
{
mThreadwiseCopy.Run(p_src, p_dst);
return RegisterBufferDesc::GetElementSpace();
}
template <class SrcMergedDimSubLengthsHack, class DstMergedDimSubLengthsHack>
__device__ void Run_hack(const TData* p_src,
TData* p_dst,
SrcMergedDimSubLengthsHack,
DstMergedDimSubLengthsHack) const
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
{
// hacks to isolate merged dimensions from normal dimensions, and calculate their offsets
// separately
// SrcMergedDimSliceLengthsHack has the same entries as SliceLengths on src merged dimensions,
// but 1 on normal dimensions;
// SrcNormalDimSliceLengthsHack has the same entries as SliceLengths on src normal dimensions,
// but 1 on merged dimensions;
mThreadwiseCopy.Run_hack(
p_src, p_dst, SrcMergedDimSubLengthsHack{}, DstMergedDimSubLengthsHack{});
mThreadwiseLoad.Run(p_src, p_buffer);
}
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
{
mThreadwiseStore.Run(p_buffer, p_dst);
}
__device__ void Run(const TData* p_src, TData* p_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
mThreadwiseLoad.Run(p_src, p_buffer);
mThreadwiseStore.Run(p_buffer, p_dst);
}
__device__ void MoveSrcSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
{
mThreadwiseCopy.MoveSrcSlicingWindow(step_sizes, positive_direction);
mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
}
__device__ void MoveDstSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
{
mThreadwiseCopy.MoveDstSlicingWindow(step_sizes, positive_direction);
mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
}
// private:
ThreadwiseCopy mThreadwiseCopy;
private:
using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
using ThreadwiseLoad =
ThreadwiseGenericTensorSliceCopy_v2<TData,
SrcDesc,
RegisterBufferDesc,
SrcCoordinate,
NormalTensorCoordinate<RegisterBufferDesc>,
SubLengths>;
using ThreadwiseStore =
ThreadwiseGenericTensorSliceCopy_v2<TData,
RegisterBufferDesc,
DstDesc,
NormalTensorCoordinate<RegisterBufferDesc>,
DstCoordinate,
SubLengths>;
ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore;
};
} // namespace ck
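BlockwiseGenericTensorSliceCopy_v2 now stages each thread's SubLengths-sized tile through a packed register tensor (RegisterBufferDesc): mThreadwiseLoad reads from the source into that buffer and mThreadwiseStore writes it out to the destination, with Run simply chaining the two. A minimal instantiation sketch; the sizes below are illustrative rather than taken from any kernel config, src_desc / dst_desc / p_src / p_dst are hypothetical, and the code is assumed to sit inside a __device__ function:

using Copy = BlockwiseGenericTensorSliceCopy_v2<
    256,                                            // BlockSize
    float,
    decltype(src_desc),
    decltype(dst_desc),
    NormalTensorCoordinate<decltype(src_desc)>,
    NormalTensorCoordinate<decltype(dst_desc)>,
    Sequence<8, 128>,                               // SliceLengths
    Sequence<1, 4>,                                 // SubLengths (per-thread tile)
    Sequence<8, 32>,                                // ThreadClusterLengths (8 * 32 = 256 threads)
    Sequence<0, 1>>;                                // ThreadClusterArrangeOrder

Copy copy({0, 0}, {0, 0});
float p_buf[Copy::GetRegisterBufferSize()];         // SubLengths element space, i.e. 4 here

copy.RunLoadRegisterBuffer(p_src, p_buf);           // source -> registers
copy.RunStoreRegisterBuffer(p_buf, p_dst);          // registers -> destination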
......
......@@ -138,47 +138,17 @@ struct ThreadwiseGenericTensorSliceCopy_v2
mDstSliceOrigin = dst_slice_origin;
}
__device__ void Run(const TData* p_src, TData* p_dst) const
template <class TDesc, class Seq>
struct IsolateMergedDimSliceLengthsHack
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
TData p_buffer_[buffer_desc.GetElementSpace()];
TData* p_buffer = p_buffer_;
#if 0
static_ford<SliceLengths>{}([&](auto data_id) {
p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)] =
p_src[(mSrcSliceOrigin + data_id).GetOffset()];
});
static_ford<SliceLengths>{}([&](auto data_id) {
p_dst[(mDstSliceOrigin + data_id).GetOffset()] =
p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)];
});
#elif 1
auto src_slice_origin = mSrcSliceOrigin;
auto dst_slice_origin = mDstSliceOrigin;
const TData* p_src_tmp = p_src + src_slice_origin.RepositionOrigin();
TData* p_dst_tmp = p_dst + dst_slice_origin.RepositionOrigin();
static_ford<SliceLengths>{}([&](auto data_id) {
p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)] =
p_src_tmp[(src_slice_origin + data_id).GetOffset()];
});
static_ford<SliceLengths>{}([&](auto data_id) {
p_dst_tmp[(dst_slice_origin + data_id).GetOffset()] =
p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)];
});
#endif
template <class IDim>
__device__ constexpr index_t operator()(IDim idim) const
{
return TDesc::ContainMultipleOriginalDimensions(idim) ? Seq{}[idim] : 1;
}
};
template <class SrcMergedDimSliceLengthsHack, class DstMergedDimSliceLengthsHack>
__device__ void Run_hack(const TData* p_src,
TData* p_dst,
SrcMergedDimSliceLengthsHack,
DstMergedDimSliceLengthsHack) const
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
......@@ -191,6 +161,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2
// but 1 on normal dimensions;
// SrcNormalDimSliceLengthsHack has the same entries as SliceLengths on src normal dimensions,
// but 1 on merged dimensions;
using SrcMergedDimSliceLengthsHack =
typename sequence_gen<SliceLengths::GetSize(),
IsolateMergedDimSliceLengthsHack<SrcDesc, SliceLengths>>::type;
using SrcNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});
......@@ -216,6 +190,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2
// but 1 on normal dimensions;
// DstNormalDimSliceLengthsHack has the same entries as SliceLengths on dst normal dimensions,
// but 1 on merged dimensions;
using DstMergedDimSliceLengthsHack =
typename sequence_gen<SliceLengths::GetSize(),
IsolateMergedDimSliceLengthsHack<DstDesc, SliceLengths>>::type;
using DstNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
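IsolateMergedDimSliceLengthsHack keeps the slice length only on dimensions the descriptor reports as merged (ContainMultipleOriginalDimensions) and collapses every normal dimension to 1; the complementary normal-dimension sequence is then (SliceLengths + 1) - merged. A hypothetical worked example, assuming a 4-d slice of lengths Sequence<2, 1, 16, 4> whose dimension 2 is the only merged one:

// SliceLengths              = Sequence<2, 1, 16, 4>, dim 2 merged, others normal (assumed)
// MergedDimSliceLengthsHack = Sequence<1, 1, 16, 1>   // slice lengths kept on merged dims only
// NormalDimSliceLengthsHack = (SliceLengths + 1) - MergedDimSliceLengthsHack
//                           = Sequence<3, 2, 17, 5> - Sequence<1, 1, 16, 1>
//                           = Sequence<2, 1,  1, 4>  // slice lengths kept on normal dims only
// The element-wise product of the two recovers SliceLengths, which is what lets the copy
// compute the merged-dimension and normal-dimension offsets separately and combine them.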
......
......@@ -128,48 +128,63 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
using type = Sequence<Xs..., Ys...>;
};
// arithmetic sequence
template <index_t IBegin, index_t NSize, index_t Increment>
struct arithmetic_sequence_gen_impl
// generate sequence
template <index_t IBegin, index_t NRemain, class F>
struct sequence_gen_impl
{
static constexpr index_t NSizeLeft = NSize / 2;
static constexpr index_t NRemainLeft = NRemain / 2;
static constexpr index_t NRemainRight = NRemain - NRemainLeft;
static constexpr index_t IMiddle = IBegin + NRemainLeft;
using type = typename sequence_merge<
typename arithmetic_sequence_gen_impl<IBegin, NSizeLeft, Increment>::type,
typename arithmetic_sequence_gen_impl<IBegin + NSizeLeft * Increment,
NSize - NSizeLeft,
Increment>::type>::type;
using type =
typename sequence_merge<typename sequence_gen_impl<IBegin, NRemainLeft, F>::type,
typename sequence_gen_impl<IMiddle, NRemainRight, F>::type>::type;
};
template <index_t IBegin, index_t Increment>
struct arithmetic_sequence_gen_impl<IBegin, 1, Increment>
template <index_t I, class F>
struct sequence_gen_impl<I, 1, F>
{
using type = Sequence<IBegin>;
static constexpr index_t Is = F{}(Number<I>{});
using type = Sequence<Is>;
};
template <index_t IBegin, index_t Increment>
struct arithmetic_sequence_gen_impl<IBegin, 0, Increment>
template <index_t I, class F>
struct sequence_gen_impl<I, 0, F>
{
using type = Sequence<>;
};
template <index_t NSize, class F>
struct sequence_gen
{
using type = typename sequence_gen_impl<0, NSize, F>::type;
};
// arithmetic sequence
template <index_t IBegin, index_t IEnd, index_t Increment>
struct arithmetic_sequence_gen
{
using type = typename arithmetic_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::type;
struct F
{
__host__ __device__ constexpr index_t operator()(index_t i) const
{
return i * Increment + IBegin;
}
};
using type = typename sequence_gen<(IEnd - IBegin) / Increment, F>::type;
};
// uniform sequence
template <index_t NSize, index_t I>
struct uniform_sequence_gen
{
struct return_constant
struct F
{
__host__ __device__ constexpr index_t operator()(index_t) const { return I; }
};
using type = decltype(
typename arithmetic_sequence_gen<0, NSize, 1>::type{}.Transform(return_constant{}));
using type = typename sequence_gen<NSize, F>::type;
};
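sequence_gen maps a constexpr functor F over the indices 0 .. NSize-1 using the same halving recursion the old arithmetic generator used, and arithmetic_sequence_gen / uniform_sequence_gen are now thin wrappers that only supply the right F. A small hedged example of a custom generator (square_f is hypothetical, and the surrounding header and ck namespace are assumed to be in scope):

// Hypothetical generator functor: squares of the index.
struct square_f
{
    __host__ __device__ constexpr index_t operator()(index_t i) const { return i * i; }
};

using squares   = typename sequence_gen<4, square_f>::type;        // Sequence<0, 1, 4, 9>
using odds      = typename arithmetic_sequence_gen<1, 9, 2>::type; // Sequence<1, 3, 5, 7>
using all_zeros = typename uniform_sequence_gen<3, 0>::type;       // Sequence<0, 0, 0>

static_assert(is_same<squares, Sequence<0, 1, 4, 9>>{}, "sequence_gen sketch");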
// reverse inclusive scan (with init) sequence
......
......@@ -139,7 +139,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 1
#if 0
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
#else
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
......
......@@ -4,6 +4,7 @@
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
using namespace ck;
......
......@@ -532,7 +532,7 @@ int main(int argc, char* argv[])
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 0
#elif 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......