Commit c01af899 authored by Chao Liu

added new tensor copy operator

parent e9733a9f
@@ -295,27 +295,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 // do work
 for(index_t e = 0; e < E; e += EPerBlock)
 {
-#if 0 // debug
 blockwise_in_copy.Run(p_in_global, p_in_block);
 blockwise_wei_copy.Run(p_wei_global, p_wei_block);
-#else
-using InSrcMergedDimSubLengthsHack = Sequence<InBlockCopySubLengths_E_N1_B_N2{}[0],
-                                              1,
-                                              InBlockCopySubLengths_E_N1_B_N2{}[2],
-                                              1>;
-using InDstMergedDimSubLengthsHack = Sequence<1, 1, 1, 1>;
-blockwise_in_copy.Run_hack(p_in_global,
-                           p_in_block,
-                           InSrcMergedDimSubLengthsHack{},
-                           InDstMergedDimSubLengthsHack{});
-using WeiSrcMergedDimSubLengthsHack = Sequence<1, 1>;
-using WeiDstMergedDimSubLengthsHack = Sequence<1, 1>;
-blockwise_wei_copy.Run_hack(p_wei_global,
-                            p_wei_block,
-                            WeiSrcMergedDimSubLengthsHack{},
-                            WeiDstMergedDimSubLengthsHack{});
-#endif
 __syncthreads();
@@ -391,10 +372,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 arithmetic_sequence_gen<0, 8, 1>::type{},
 Number<1>{});
 #else
-using OutSrcMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>;
-using OutDstMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>;
 ThreadwiseGenericTensorSliceCopy_v2<
 Float,
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
@@ -403,10 +380,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
 {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
-.Run_hack(p_out_thread,
-          p_out_thread_on_global,
-          OutSrcMergedDimSliceLengthsHack{},
-          OutDstMergedDimSliceLengthsHack{});
+.Run(p_out_thread, p_out_thread_on_global);
 #endif
 }
 }
......
@@ -155,6 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
               "GemmDataPerReadB alignment requirement is not satisfied");
+#if 1 // debug
 // input blockwise copy
 // slice a merged tensor, reorder and copy to a normal tensor
 // this copy operator already has blockwise offset built-in
@@ -172,6 +173,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 InBlockCopySrcDataPerRead_B,
 InBlockCopyDstDataPerWrite_N2>(
 {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#else
+auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
+    BlockSize,
+    Float,
+    decltype(in_e_n1_b_n2_global_merged_desc),
+    decltype(in_e_n1_b_n2_block_desc),
+    MergedTensorCoordinate<decltype(in_e_n1_b_n2_global_merged_desc)>,
+    NormalTensorCoordinate<decltype(in_e_n1_b_n2_block_desc)>,
+    decltype(in_e_n1_b_n2_block_desc.GetLengths()),
+    InBlockCopySubLengths_E_N1_B_N2,
+    InBlockCopyClusterLengths_E_N1_B_N2,
+    InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#endif
 // weight tensor
 // tensor descriptor in device memory, src of blockwise copy
@@ -184,6 +198,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 Sequence<EPerBlock, KPerBlock>{},
 Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
+#if 1 // debug
 // operator for blockwise copy of weight into LDS
 // slice a tensor, and copy it into another tensor
 // this copy operator already have blockwise offset built-in
@@ -201,6 +216,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 WeiBlockCopySrcDataPerRead_E,
 WeiBlockCopyDstDataPerWrite_K>(
 {0, k_block_data_on_global}, {0, 0});
+#else
+auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
+    BlockSize,
+    Float,
+    decltype(wei_e_k_global_desc),
+    decltype(wei_e_k_block_desc),
+    NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
+    NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
+    decltype(wei_e_k_block_desc.GetLengths()),
+    WeiBlockCopySubLengths_E_K,
+    WeiBlockCopyClusterLengths_E_K,
+    WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+#endif
 // GEMM definition
 // c_mtx += transpose(a_mtx) * b_mtx
@@ -291,54 +319,61 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 Float* p_wei_block_next =
     even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;
-Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
-Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
+Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
+Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+#if 1
 blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
 p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+#else
+blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
+blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+#endif
 __syncthreads();
 // LDS doubel buffer: load next data from device mem
-blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
-blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global,
-                                            p_wei_register_clipboard);
+blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
+blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global,
+                                         p_wei_register_buffer);
 // LDS double buffer: GEMM on current data
 blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
 // LDS double buffer: store next data to LDS
-blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
-                                            p_in_block_next);
-blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
-                                             p_wei_block_next);
+blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
+blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
 }
 }
 // LDS double buffer: tail
 {
-Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
-Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
+Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
+Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 // even iteration
+#if 1
 blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
 p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+#else
+blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
+blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+#endif
 __syncthreads();
 // LDS doubel buffer: load next data from device mem
-blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
-blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global,
-                                            p_wei_register_clipboard);
+blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
+blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);
 // LDS double buffer: GEMM on current data
 blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
 // LDS double buffer: store next data to LDS
-blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
-                                            p_in_block_double + in_block_space);
-blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
-                                             p_wei_block_double + wei_block_space);
+blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
+                                         p_in_block_double + in_block_space);
+blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
+                                          p_wei_block_double + wei_block_space);
 // odd iteration
 __syncthreads();
@@ -396,6 +431,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
     k_thread_data_on_global, 0, b_thread_data_on_global, 0);
+#if 1 // debug
 threadwise_generic_tensor_slice_copy_v1(
 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
 p_out_thread,
@@ -406,6 +442,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
 arithmetic_sequence_gen<0, 8, 1>::type{},
 Number<1>{});
+#else
+ThreadwiseGenericTensorSliceCopy_v2<
+    Float,
+    decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+    decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+    NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
+    MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
+    decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+    {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+    .Run(p_out_thread, p_out_thread_on_global);
+#endif
 }
 }
 };
......
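Note on the loop this file changes: the LDS double-buffer scheme loads the next E-tile from global memory into a per-thread register buffer while the GEMM consumes the current tile out of one half of LDS, then commits the register buffer to the other half. The sketch below is a simplified host-side model of that ordering only; every name in it (Tile, load_tile, gemm_on, num_tiles) is made up for illustration and none of it is CK API.

```cpp
#include <cstddef>
#include <vector>

using Tile = std::vector<float>;

// Stand-in for the blockwise copy: fetch tile e from "global memory".
Tile load_tile(const std::vector<Tile>& global, std::size_t e) { return global[e]; }

// Stand-in for blockwise_gemm.Run: consume one tile.
void gemm_on(const Tile& tile, float& acc)
{
    for(float x : tile)
        acc += x;
}

int main()
{
    const std::size_t num_tiles = 8;
    std::vector<Tile> global(num_tiles, Tile(4, 1.0f));

    Tile lds[2] = {load_tile(global, 0), Tile(4)}; // prologue: tile 0 already in LDS half 0
    float acc = 0.0f;

    for(std::size_t e = 0; e + 1 < num_tiles; ++e)
    {
        const std::size_t now = e % 2, next = 1 - now;

        Tile reg = load_tile(global, e + 1); // RunLoadRegisterBuffer: prefetch next tile
        gemm_on(lds[now], acc);              // GEMM on the current tile
        lds[next] = reg;                     // RunStoreRegisterBuffer: publish next tile
    }

    gemm_on(lds[(num_tiles - 1) % 2], acc);  // tail iteration: last tile

    return acc == float(num_tiles * 4) ? 0 : 1;
}
```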
@@ -11,7 +11,7 @@
 namespace ck {
-// B = merge(N, H, W)
+// B = merge(N, Ho, Wo)
 template <index_t GridSize,
 index_t BlockSize,
 class Float,
@@ -237,24 +237,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
 for(index_t e_block_data_begin = 0; e_block_data_begin < E; e_block_data_begin += EPerBlock)
 {
-#if 0
 blockwise_in_copy.Run(p_in_global, p_in_block);
 blockwise_wei_copy.Run(p_wei_global, p_wei_block);
-#else
-using InSrcMergedDimSubLengthsHack = InBlockCopySubLengths_E_B;
-using InDstMergedDimSubLengthsHack = Sequence<1, 1>;
-blockwise_in_copy.Run_hack(p_in_global,
-                           p_in_block,
-                           InSrcMergedDimSubLengthsHack{},
-                           InDstMergedDimSubLengthsHack{});
-using WeiSrcMergedDimSubLengthsHack = Sequence<1, 1>;
-using WeiDstMergedDimSubLengthsHack = Sequence<1, 1>;
-blockwise_wei_copy.Run_hack(p_wei_global,
-                            p_wei_block,
-                            WeiSrcMergedDimSubLengthsHack{},
-                            WeiDstMergedDimSubLengthsHack{});
-#endif
 __syncthreads();
@@ -318,17 +302,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
 for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
 {
-#if 0
 threadwise_out_copy.Run(p_out_thread, p_out_global);
-#else
-using OutSrcMergedDimSubLengthsHack = Sequence<1, 1, 1>;
-using OutDstMergedDimSubLengthsHack =
-    Sequence<1, 1, OutThreadCopySliceLengths{}[2]>;
-threadwise_out_copy.Run_hack(p_out_thread,
-                             p_out_global,
-                             OutSrcMergedDimSubLengthsHack{},
-                             OutDstMergedDimSubLengthsHack{});
-#endif
 threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
 threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
......
@@ -24,7 +24,7 @@ template <index_t BlockSize,
 class DstDesc,
 class SliceLengths,
 class SubLengths,
-class DataClusterLengths,
+class ThreadClusterLengths,
 class ThreadClusterArrangeOrder,
 class SrcAccessOrder,
 class DstAccessOrder,
@@ -65,7 +65,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
 // check NDim consistency
 static_assert(nDim == SrcDesc::GetNumOfDimension() &&
 nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
-nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
+nDim == SubLengths::GetSize() &&
+nDim == ThreadClusterLengths::GetSize() &&
 nDim == ThreadClusterArrangeOrder::GetSize() &&
 nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
 "wrong");
@@ -78,13 +79,13 @@ struct BlockwiseGenericTensorSliceCopy_v1
 // thread cluster
 constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
 // BlockSize
 static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
 // divide work
-constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
+constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};
 static_for<0, nDim, 1>{}([&](auto IDim) {
 static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
@@ -160,9 +161,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
 mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
 }
-__device__ static constexpr index_t GetRegisterClipboardSize()
+__device__ static constexpr index_t GetRegisterBufferSize()
 {
-constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
+constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
 constexpr auto thread_tensor_desc =
 make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
@@ -170,14 +171,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
 return thread_tensor_desc.GetElementSpace();
 }
-__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
-                                         Float* __restrict__ p_clipboard) const
+__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
+                                      Float* __restrict__ p_Buffer) const
 {
 constexpr auto thread_sub_tensor_lengths = SubLengths{};
-constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
+constexpr auto data_per_cluster_per_dims =
+    thread_sub_tensor_lengths * ThreadClusterLengths{};
-constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
+constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
 constexpr auto thread_tensor_desc =
 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
@@ -187,25 +189,24 @@ struct BlockwiseGenericTensorSliceCopy_v1
 constexpr auto src_thread_data_multi_id_begin =
 repeat_multi_id * data_per_cluster_per_dims;
-constexpr auto clipboard_data_multi_id_begin =
-    repeat_multi_id * thread_sub_tensor_lengths;
+constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
 constexpr index_t src_offset =
 SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
-constexpr index_t clipboard_offset =
-    thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+constexpr index_t Buffer_offset =
+    thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
 #else
 ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
 const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
-const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
 const index_t src_offset =
 SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
-const index_t clipboard_offset =
-    thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+const index_t Buffer_offset =
+    thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
 #endif
 // By position the origin of the per-thread window at the point, where multi-index
@@ -219,7 +220,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
 p_src + src_offset + mThreadSrcOffset,
 make_zero_array<index_t, nDim>(),
 thread_tensor_desc,
-p_clipboard + clipboard_offset,
+p_Buffer + Buffer_offset,
 make_zero_array<index_t, nDim>(),
 thread_sub_tensor_lengths,
 SrcAccessOrder{},
@@ -227,38 +228,38 @@ struct BlockwiseGenericTensorSliceCopy_v1
 });
 }
-__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
-                                          Float* __restrict__ p_dst) const
+__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_Buffer,
+                                       Float* __restrict__ p_dst) const
 {
 constexpr auto thread_sub_tensor_lengths = SubLengths{};
-constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
+constexpr auto data_per_cluster_per_dims =
+    thread_sub_tensor_lengths * ThreadClusterLengths{};
-constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
+constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
 constexpr auto thread_tensor_desc =
 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
 #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
 static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-constexpr auto clipboard_data_multi_id_begin =
-    repeat_multi_id * thread_sub_tensor_lengths;
+constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
 constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
-constexpr index_t clipboard_offset =
-    thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+constexpr index_t Buffer_offset =
+    thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
 constexpr index_t dst_offset =
 DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 #else
 ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
 const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
-const index_t clipboard_offset =
-    thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+const index_t Buffer_offset =
+    thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
 const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 #endif
@@ -271,7 +272,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
 // If in the future, you want to enable SubLengths > 1 at the merged dimension,
 // special care in implementation is needed
 threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
-p_clipboard + clipboard_offset,
+p_Buffer + Buffer_offset,
 make_zero_array<index_t, nDim>(),
 DstDesc{},
 p_dst + dst_offset + mThreadDstOffset,
@@ -284,10 +285,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
 {
-Float p_clipboard[GetRegisterClipboardSize()];
+Float p_Buffer[GetRegisterBufferSize()];
-RunLoadRegisterClipboard(p_src, p_clipboard);
-RunStoreRegisterClipboard(p_clipboard, p_dst);
+RunLoadRegisterBuffer(p_src, p_Buffer);
+RunStoreRegisterBuffer(p_Buffer, p_dst);
 }
 // When moving the slicing windows along a merged dimension, if the strides of the
@@ -382,24 +383,30 @@ template <index_t BlockSize,
 class DstCoordinate,
 class SliceLengths,
 class SubLengths,
-class DataClusterLengths,
+class ThreadClusterLengths,
 class ThreadClusterArrangeOrder>
 struct BlockwiseGenericTensorSliceCopy_v2
 {
-using ThreadwiseCopy = ThreadwiseGenericTensorSliceCopy_v2<TData,
-                                                           SrcDesc,
-                                                           DstDesc,
-                                                           SrcCoordinate,
-                                                           DstCoordinate,
-                                                           SubLengths>;
 static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
 __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
 DstCoordinate dst_block_slice_origin)
 {
+static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+    nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
+    nDim == SubLengths::GetSize() &&
+    nDim == ThreadClusterLengths::GetSize() &&
+    nDim == ThreadClusterArrangeOrder::GetSize(),
+    "wrong! nDim not consistent");
+static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
+    "wrong! threads should be mapped to cover entire slicing window");
 constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
+    "wrong! BlockSize not consistent with ThreadClusterLengths");
 const auto thread_cluster_multi_id =
 thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
@@ -409,43 +416,66 @@ struct BlockwiseGenericTensorSliceCopy_v2
 const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
-mThreadwiseCopy.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
-mThreadwiseCopy.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
+mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
+mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
+mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
+mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
 }
-__device__ void Run(const TData* p_src, TData* p_dst) const
+__device__ static constexpr index_t GetRegisterBufferSize()
 {
-mThreadwiseCopy.Run(p_src, p_dst);
+return RegisterBufferDesc::GetElementSpace();
 }
-template <class SrcMergedDimSubLengthsHack, class DstMergedDimSubLengthsHack>
-__device__ void Run_hack(const TData* p_src,
-                         TData* p_dst,
-                         SrcMergedDimSubLengthsHack,
-                         DstMergedDimSubLengthsHack) const
+__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
 {
-// hacks to isolate merged dimension from normal dimensions, and calculate their offset
-// seperately
-// SrcMergedDimSliceLengthsHack has entry same as SliceLengths on src merged dimensions,
-// but 1 on normal dimensions;
-// SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions,
-// but 1 on merged dimensions;
-mThreadwiseCopy.Run_hack(
-    p_src, p_dst, SrcMergedDimSubLengthsHack{}, DstMergedDimSubLengthsHack{});
+mThreadwiseLoad.Run(p_src, p_buffer);
+}
+__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
+{
+mThreadwiseStore.Run(p_buffer, p_dst);
+}
+__device__ void Run(const TData* p_src, TData* p_dst) const
+{
+TData p_buffer[GetRegisterBufferSize()];
+mThreadwiseLoad.Run(p_src, p_buffer);
+mThreadwiseStore.Run(p_buffer, p_dst);
 }
 __device__ void MoveSrcSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
 {
-mThreadwiseCopy.MoveSrcSlicingWindow(step_sizes, positive_direction);
+mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
 }
 __device__ void MoveDstSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
 {
-mThreadwiseCopy.MoveDstSlicingWindow(step_sizes, positive_direction);
+mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
 }
-// private:
-ThreadwiseCopy mThreadwiseCopy;
+private:
+using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
+using ThreadwiseLoad =
+    ThreadwiseGenericTensorSliceCopy_v2<TData,
+                                        SrcDesc,
+                                        RegisterBufferDesc,
+                                        SrcCoordinate,
+                                        NormalTensorCoordinate<RegisterBufferDesc>,
+                                        SubLengths>;
+using ThreadwiseStore =
+    ThreadwiseGenericTensorSliceCopy_v2<TData,
+                                        RegisterBufferDesc,
+                                        DstDesc,
+                                        NormalTensorCoordinate<RegisterBufferDesc>,
+                                        DstCoordinate,
+                                        SubLengths>;
+ThreadwiseLoad mThreadwiseLoad;
+ThreadwiseStore mThreadwiseStore;
 };
 } // namespace ck
......
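What the rewritten BlockwiseGenericTensorSliceCopy_v2 above boils down to: the copy is now composed of a threadwise load (source tensor into a per-thread register buffer described by SubLengths) and a threadwise store (register buffer into the destination tensor), so callers can run the two halves separately around a GEMM. Below is a rough host-side analogue of that decomposition; the class name, raw offsets and sizes are invented for the sketch and stand in for the real tensor descriptors and coordinates.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Toy model: "load" pulls a contiguous tile into a small buffer, "store"
// writes that buffer out; Run() is load followed by store, mirroring
// RunLoadRegisterBuffer / RunStoreRegisterBuffer / Run in the real class.
struct BlockwiseCopyModel
{
    std::size_t tile_size;  // plays the role of GetRegisterBufferSize()
    std::size_t src_offset; // plays the role of the src slice origin
    std::size_t dst_offset; // plays the role of the dst slice origin

    void RunLoadRegisterBuffer(const float* p_src, float* p_buffer) const
    {
        for(std::size_t i = 0; i < tile_size; ++i)
            p_buffer[i] = p_src[src_offset + i];
    }

    void RunStoreRegisterBuffer(const float* p_buffer, float* p_dst) const
    {
        for(std::size_t i = 0; i < tile_size; ++i)
            p_dst[dst_offset + i] = p_buffer[i];
    }

    void Run(const float* p_src, float* p_dst) const
    {
        std::vector<float> buffer(tile_size); // the per-thread register buffer
        RunLoadRegisterBuffer(p_src, buffer.data());
        RunStoreRegisterBuffer(buffer.data(), p_dst);
    }
};

int main()
{
    std::vector<float> src(16), dst(16, 0.0f);
    for(std::size_t i = 0; i < src.size(); ++i)
        src[i] = static_cast<float>(i);

    BlockwiseCopyModel copy{4, 8, 2}; // copy src[8..11] into dst[2..5]
    copy.Run(src.data(), dst.data());

    assert(dst[2] == 8.0f && dst[5] == 11.0f);
    return 0;
}
```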
@@ -138,47 +138,17 @@ struct ThreadwiseGenericTensorSliceCopy_v2
 mDstSliceOrigin = dst_slice_origin;
 }
-__device__ void Run(const TData* p_src, TData* p_dst) const
-{
-constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
-TData p_buffer_[buffer_desc.GetElementSpace()];
-TData* p_buffer = p_buffer_;
-#if 0
-static_ford<SliceLengths>{}([&](auto data_id) {
-    p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)] =
-        p_src[(mSrcSliceOrigin + data_id).GetOffset()];
-});
-static_ford<SliceLengths>{}([&](auto data_id) {
-    p_dst[(mDstSliceOrigin + data_id).GetOffset()] =
-        p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)];
-});
-#elif 1
-auto src_slice_origin = mSrcSliceOrigin;
-auto dst_slice_origin = mDstSliceOrigin;
-const TData* p_src_tmp = p_src + src_slice_origin.RepositionOrigin();
-TData* p_dst_tmp = p_dst + dst_slice_origin.RepositionOrigin();
-static_ford<SliceLengths>{}([&](auto data_id) {
-    p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)] =
-        p_src_tmp[(src_slice_origin + data_id).GetOffset()];
-});
-static_ford<SliceLengths>{}([&](auto data_id) {
-    p_dst_tmp[(dst_slice_origin + data_id).GetOffset()] =
-        p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)];
-});
-#endif
-}
-template <class SrcMergedDimSliceLengthsHack, class DstMergedDimSliceLengthsHack>
-__device__ void Run_hack(const TData* p_src,
-                         TData* p_dst,
-                         SrcMergedDimSliceLengthsHack,
-                         DstMergedDimSliceLengthsHack) const
+template <class TDesc, class Seq>
+struct IsolateMergedDimSliceLengthsHack
+{
+    template <class IDim>
+    __device__ constexpr index_t operator()(IDim idim) const
+    {
+        return TDesc::ContainMultipleOriginalDimensions(idim) ? Seq{}[idim] : 1;
+    }
+};
+__device__ void Run(const TData* p_src, TData* p_dst) const
 {
 constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
@@ -191,6 +161,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2
 // but 1 on normal dimensions;
 // SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions,
 // but 1 on merged dimensions;
+using SrcMergedDimSliceLengthsHack =
+    typename sequence_gen<SliceLengths::GetSize(),
+                          IsolateMergedDimSliceLengthsHack<SrcDesc, SliceLengths>>::type;
 using SrcNormalDimSliceLengthsHack =
     decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});
@@ -216,6 +190,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2
 // but 1 on normal dimensions;
 // DstNormalDimSliceLengthsHack has entry same as SliceLengths on dst normal dimensions,
 // but 1 on merged dimensions;
+using DstMergedDimSliceLengthsHack =
+    typename sequence_gen<SliceLengths::GetSize(),
+                          IsolateMergedDimSliceLengthsHack<DstDesc, SliceLengths>>::type;
 using DstNormalDimSliceLengthsHack =
     decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
......
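The new IsolateMergedDimSliceLengthsHack above encodes, per dimension, the rule that callers previously had to pass in by hand through Run_hack: keep the slice length on a dimension that the descriptor merges from several original dimensions, and use 1 everywhere else; the complementary "normal" lengths are then SliceLengths + 1 - Merged, and iterating the two hacked length vectors together still covers the whole slice exactly once. A small constexpr illustration of that rule with made-up slice lengths and merged-dimension flags (not the CK descriptors):

```cpp
#include <array>
#include <cstddef>

// Toy stand-ins: slice lengths per dimension and a flag saying which
// dimensions are "merged" (i.e. map to multiple original dimensions).
constexpr std::array<std::size_t, 3> slice_lengths{2, 4, 8};
constexpr std::array<bool, 3> is_merged_dim{false, true, false};

// Same rule as IsolateMergedDimSliceLengthsHack: keep the slice length on
// merged dimensions, collapse everything else to 1.
constexpr std::size_t merged_len(std::size_t idim)
{
    return is_merged_dim[idim] ? slice_lengths[idim] : 1;
}

// The complementary "normal" lengths: SliceLengths + 1 - Merged.
constexpr std::size_t normal_len(std::size_t idim)
{
    return slice_lengths[idim] + 1 - merged_len(idim);
}

// Merged part is <1, 4, 1>, normal part is <2, 1, 8>, and the two together
// cover the full 2 x 4 x 8 slice: product(merged) * product(normal) == product(slice).
static_assert(merged_len(0) == 1 && merged_len(1) == 4 && merged_len(2) == 1, "");
static_assert(normal_len(0) == 2 && normal_len(1) == 1 && normal_len(2) == 8, "");
static_assert((merged_len(0) * merged_len(1) * merged_len(2)) *
                  (normal_len(0) * normal_len(1) * normal_len(2)) ==
              slice_lengths[0] * slice_lengths[1] * slice_lengths[2], "");

int main() { return 0; }
```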
@@ -128,48 +128,63 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
 using type = Sequence<Xs..., Ys...>;
 };
-// arithmetic sqeuence
-template <index_t IBegin, index_t NSize, index_t Increment>
-struct arithmetic_sequence_gen_impl
+// generate sequence
+template <index_t IBegin, index_t NRemain, class F>
+struct sequence_gen_impl
 {
-static constexpr index_t NSizeLeft = NSize / 2;
+static constexpr index_t NRemainLeft = NRemain / 2;
+static constexpr index_t NRemainRight = NRemain - NRemainLeft;
+static constexpr index_t IMiddle = IBegin + NRemainLeft;
-using type = typename sequence_merge<
-    typename arithmetic_sequence_gen_impl<IBegin, NSizeLeft, Increment>::type,
-    typename arithmetic_sequence_gen_impl<IBegin + NSizeLeft * Increment,
-                                          NSize - NSizeLeft,
-                                          Increment>::type>::type;
+using type =
+    typename sequence_merge<typename sequence_gen_impl<IBegin, NRemainLeft, F>::type,
+                            typename sequence_gen_impl<IMiddle, NRemainRight, F>::type>::type;
 };
-template <index_t IBegin, index_t Increment>
-struct arithmetic_sequence_gen_impl<IBegin, 1, Increment>
+template <index_t I, class F>
+struct sequence_gen_impl<I, 1, F>
 {
-using type = Sequence<IBegin>;
+static constexpr index_t Is = F{}(Number<I>{});
+using type = Sequence<Is>;
 };
-template <index_t IBegin, index_t Increment>
-struct arithmetic_sequence_gen_impl<IBegin, 0, Increment>
+template <index_t I, class F>
+struct sequence_gen_impl<I, 0, F>
 {
 using type = Sequence<>;
 };
+template <index_t NSize, class F>
+struct sequence_gen
+{
+using type = typename sequence_gen_impl<0, NSize, F>::type;
+};
+// arithmetic sequence
 template <index_t IBegin, index_t IEnd, index_t Increment>
 struct arithmetic_sequence_gen
 {
-using type = typename arithmetic_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::type;
+struct F
+{
+    __host__ __device__ constexpr index_t operator()(index_t i) const
+    {
+        return i * Increment + IBegin;
+    }
+};
+using type = typename sequence_gen<(IEnd - IBegin) / Increment, F>::type;
 };
 // uniform sequence
 template <index_t NSize, index_t I>
 struct uniform_sequence_gen
 {
-struct return_constant
+struct F
 {
 __host__ __device__ constexpr index_t operator()(index_t) const { return I; }
 };
-using type = decltype(
-    typename arithmetic_sequence_gen<0, NSize, 1>::type{}.Transform(return_constant{}));
+using type = typename sequence_gen<NSize, F>::type;
 };
 // reverse inclusive scan (with init) sequence
......
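The new sequence_gen above turns the old arithmetic generator into a general one: it builds a Sequence by divide and conquer, evaluating a functor F at each index 0 .. NSize-1, and arithmetic_sequence_gen / uniform_sequence_gen simply supply the right functor. Below is a self-contained sketch of the same idea with compile-time checks of the results it should produce; Sequence, sequence_merge and index_t here are simplified stand-ins, not the actual ck headers.

```cpp
#include <cstddef>
#include <type_traits>

using index_t = std::size_t;

template <index_t... Is>
struct Sequence {};

// concatenate two sequences
template <class X, class Y>
struct sequence_merge;

template <index_t... Xs, index_t... Ys>
struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
{
    using type = Sequence<Xs..., Ys...>;
};

// divide-and-conquer generator: element i of the result is F{}(IBegin + i)
template <index_t IBegin, index_t NRemain, class F>
struct sequence_gen_impl
{
    static constexpr index_t NLeft = NRemain / 2;
    using type = typename sequence_merge<
        typename sequence_gen_impl<IBegin, NLeft, F>::type,
        typename sequence_gen_impl<IBegin + NLeft, NRemain - NLeft, F>::type>::type;
};

template <index_t I, class F>
struct sequence_gen_impl<I, 1, F>
{
    static constexpr index_t Is = F{}(I);
    using type = Sequence<Is>;
};

template <index_t I, class F>
struct sequence_gen_impl<I, 0, F>
{
    using type = Sequence<>;
};

template <index_t NSize, class F>
struct sequence_gen
{
    using type = typename sequence_gen_impl<0, NSize, F>::type;
};

// arithmetic sequence rebuilt on top of the generic generator
template <index_t IBegin, index_t IEnd, index_t Increment>
struct arithmetic_sequence_gen
{
    struct F
    {
        constexpr index_t operator()(index_t i) const { return i * Increment + IBegin; }
    };
    using type = typename sequence_gen<(IEnd - IBegin) / Increment, F>::type;
};

// uniform sequence: every element is I
template <index_t NSize, index_t I>
struct uniform_sequence_gen
{
    struct F
    {
        constexpr index_t operator()(index_t) const { return I; }
    };
    using type = typename sequence_gen<NSize, F>::type;
};

static_assert(std::is_same<arithmetic_sequence_gen<0, 8, 1>::type,
                           Sequence<0, 1, 2, 3, 4, 5, 6, 7>>::value,
              "the 0..7 access order used by the output copy above");
static_assert(std::is_same<arithmetic_sequence_gen<3, 9, 2>::type, Sequence<3, 5, 7>>::value, "");
static_assert(std::is_same<uniform_sequence_gen<4, 1>::type, Sequence<1, 1, 1, 1>>::value, "");

int main() { return 0; }
```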
@@ -139,7 +139,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
 for(index_t i = 0; i < nrepeat; ++i)
 {
 constexpr auto gridwise_conv =
-#if 1
+#if 0
 GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 #else
 GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
......
@@ -4,6 +4,7 @@
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 using namespace ck;
......
@@ -532,7 +532,7 @@ int main(int argc, char* argv[])
 #elif 0
 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
 (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
+#elif 1
 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
 in_nchw,
 wei_kcyx_desc,
......