#ifndef CK_BLOCKWISE_4D_TENSOR_OP_HPP
#define CK_BLOCKWISE_4D_TENSOR_OP_HPP

#include "ConstantTensorDescriptor.hpp"
#include "threadwise_tensor_slice_copy.hpp"

namespace ck {

template <index_t BlockSize, class Float, class DstDesc, class F>
__device__ void
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto dst_desc = DstDesc{};

    constexpr auto desc = make_ConstantTensorDescriptor_packed(dst_desc.GetLengths());

#if 0
    if(get_thread_local_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
    }
#endif

    constexpr index_t NLoop = desc.GetElementSize() / BlockSize;

    for(index_t iloop = 0; iloop < NLoop; ++iloop)
    {
        index_t is = get_thread_local_1d_id() + iloop * BlockSize;

        const index_t did0 = is / desc.GetStride(I0);
        is -= did0 * desc.GetStride(I0);

        const index_t did1 = is / desc.GetStride(I1);
        is -= did1 * desc.GetStride(I1);

        const index_t did2 = is / desc.GetStride(I2);
        is -= did2 * desc.GetStride(I2);

        const index_t did3 = is / desc.GetStride(I3);

        const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);

        f(p_dst[dindex]);
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        index_t is = get_thread_local_1d_id() + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const index_t did0 = is / desc.GetStride(I0);
            is -= did0 * desc.GetStride(I0);

            const index_t did1 = is / desc.GetStride(I1);
            is -= did1 * desc.GetStride(I1);

            const index_t did2 = is / desc.GetStride(I2);
            is -= did2 * desc.GetStride(I2);

            const index_t did3 = is / desc.GetStride(I3);

            const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);

            f(p_dst[dindex]);
        }
    }
}

// Function: p_dst[reorder[i0], reorder[i1], reorder[i2], reorder[i3]] = p_src[i0,i1,i2,i3]
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class MapDst2Src,
          class F>
__device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
    SrcDesc,
    const Float* __restrict__ p_src,
    DstDesc,
    Float* __restrict__ p_dst,
    SrcOpLengths,
    MapDst2Src,
    F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr index_t IR0 = MapDst2Src{}.Get(I0);
    constexpr index_t IR1 = MapDst2Src{}.Get(I1);
    constexpr index_t IR2 = MapDst2Src{}.Get(I2);
    constexpr index_t IR3 = MapDst2Src{}.Get(I3);

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(SrcOpLengths{});

    constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;

    for(index_t iloop = 0; iloop < NLoop; ++iloop)
    {
        index_t is = get_thread_local_1d_id() + iloop * BlockSize;

        index_t did[4];

        did[0] = is / ref_desc.GetStride(I0);
        is -= did[0] * ref_desc.GetStride(I0);

        did[1] = is / ref_desc.GetStride(I1);
        is -= did[1] * ref_desc.GetStride(I1);

        did[2] = is / ref_desc.GetStride(I2);
        is -= did[2] * ref_desc.GetStride(I2);

        did[3] = is / ref_desc.GetStride(I3);

        const index_t src_index = src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
        const index_t dst_index =
            dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);

        f(p_src[src_index], p_dst[dst_index]);
    }

    constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        index_t is = get_thread_local_1d_id() + NLoop * BlockSize;

        if(is < ref_desc.GetElementSize())
        {
            index_t did[4];

            did[0] = is / ref_desc.GetStride(I0);
            is -= did[0] * ref_desc.GetStride(I0);

            did[1] = is / ref_desc.GetStride(I1);
            is -= did[1] * ref_desc.GetStride(I1);

            did[2] = is / ref_desc.GetStride(I2);
            is -= did[2] * ref_desc.GetStride(I2);

            did[3] = is / ref_desc.GetStride(I3);

            const index_t src_index =
                src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
            const index_t dst_index =
                dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);

            f(p_src[src_index], p_dst[dst_index]);
        }
    }
}
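// Illustrative sketch (kept under "#if 0" so it is never compiled): a hypothetical block-wide
// transposing copy that shows the MapDst2Src convention used above. Destination dimension k
// reads source dimension MapDst2Src[k], so with packed source lengths {2, 3, 4, 5} and
// MapDst2Src = Sequence<1, 0, 3, 2>, the destination lengths are {3, 2, 5, 4}. The BlockSize
// value and the p_in/p_out pointers are assumptions made for this sketch only.
#if 0
__device__ void example_reorder_copy(const float* __restrict__ p_in, float* __restrict__ p_out)
{
    constexpr index_t BlockSize = 256;

    constexpr auto src_desc = make_ConstantTensorDescriptor_packed(Sequence<2, 3, 4, 5>{});
    constexpr auto dst_desc = make_ConstantTensorDescriptor_packed(Sequence<3, 2, 5, 4>{});

    // plain element-wise copy; the reorder happens through the index map
    auto f_copy = [](const float& src, float& dst) { dst = src; };

    blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src<BlockSize>(
        src_desc, p_in, dst_desc, p_out, src_desc.GetLengths(), Sequence<1, 0, 3, 2>{}, f_copy);
}
#endif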
template <index_t BlockSize, class Float, class DstDesc>
__device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
{
    auto f_set_zero = [](Float& v) { v = Float(0); };

    blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
}

template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class MapDst2Src>
__device__ void
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
                                                     const Float* __restrict__ p_src,
                                                     DstDesc,
                                                     Float* __restrict__ p_dst,
                                                     SrcOpLengths,
                                                     MapDst2Src)
{
    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

    blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src<BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
}

template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class CopyLengths,
          index_t DataPerRead>
struct Blockwise4dTensorCopy1
{
    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

    __device__ constexpr Blockwise4dTensorCopy1()
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        static_assert(DataPerRead == 1 ||
                          (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
                      "wrong! only support stride3 == 1 if DataPerRead > 1!\n");

        static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                      "wrong! only support DataPerRead == 1, 2 or 4!\n");

        static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
                          DstDesc{}.GetStride(I2) % DataPerRead == 0,
                      "src and dst stride2 should be multiple of DataPerRead to keep alignment");

        // we allow out-of-bound read from src in D3 dimension,
        // but we need to make sure dst stride2 is big enough,
        // so that the out-of-bound write won't contaminate next line in dst
        constexpr index_t L3 = CopyLengths{}.Get(I3);

        constexpr index_t read_per_d3 = math::integer_divide_ceil(L3, DataPerRead);

        static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2),
                      "wrong! out-of-bound write will contaminate next line!\n");
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};

        constexpr index_t L0 = CopyLengths{}.Get(I0);
        constexpr index_t L1 = CopyLengths{}.Get(I1);
        constexpr index_t L2 = CopyLengths{}.Get(I2);
        constexpr index_t L3 = CopyLengths{}.Get(I3);

        constexpr index_t read_per_d3 = math::integer_divide_ceil(L3, DataPerRead);

        constexpr auto ref_desc =
            make_ConstantTensorDescriptor_packed(Sequence<L0, L1, L2, read_per_d3>{});

        constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;

        auto f_copy = [&](index_t is) {
            index_t did[4];

            did[0] = is / ref_desc.GetStride(I0);
            is -= did[0] * ref_desc.GetStride(I0);

            did[1] = is / ref_desc.GetStride(I1);
            is -= did[1] * ref_desc.GetStride(I1);

            did[2] = is / ref_desc.GetStride(I2);
            is -= did[2] * ref_desc.GetStride(I2);

            did[3] = is / ref_desc.GetStride(I3);

            const index_t src_index =
                src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead);
            const index_t dst_index =
                dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead);

            *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
                *(reinterpret_cast<const vector_t*>(p_src + src_index));
        };

        for(index_t iloop = 0; iloop < NLoop; ++iloop)
        {
            index_t is = get_thread_local_1d_id() + iloop * BlockSize;

            f_copy(is);
        }

        constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);

        if(has_tail)
        {
            index_t is = get_thread_local_1d_id() + NLoop * BlockSize;

            if(is < ref_desc.GetElementSize())
            {
                f_copy(is);
            }
        }
    }
};
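// Illustrative sketch (not compiled): a hypothetical use of Blockwise4dTensorCopy1 to move a
// packed 1x8x16x64 tile with float4 vector loads (DataPerRead = 4). The BlockSize value, the
// descriptor and the p_global/p_lds pointers are assumptions made for this sketch only.
#if 0
__device__ void example_blockwise_copy1(const float* __restrict__ p_global,
                                        float* __restrict__ p_lds)
{
    constexpr index_t BlockSize = 256;

    constexpr auto desc = make_ConstantTensorDescriptor_packed(Sequence<1, 8, 16, 64>{});

    // stride2 = 64 is a multiple of DataPerRead = 4, and 16 float4 reads exactly cover a line,
    // so the constructor's static_asserts are satisfied
    constexpr auto blockwise_copy = Blockwise4dTensorCopy1<BlockSize,
                                                           float,
                                                           decltype(desc),
                                                           decltype(desc),
                                                           Sequence<1, 8, 16, 64>,
                                                           4>{};

    blockwise_copy.Run(p_global, p_lds);
}
#endif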
template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class DstOpLengths,
          class GlobalLowerPads>
struct BlockwiseChwnTensorCopyPadded
{
    __device__ void Run(const Float* __restrict__ p_src,
                        index_t c_block_data_begin,
                        index_t ho_block_data_begin,
                        index_t wo_block_data_begin,
                        index_t n_block_data_begin,
                        Float* __restrict__ p_dst,
                        index_t h_block_pad_low,
                        index_t w_block_pad_low,
                        index_t h_block_pad_up,
                        index_t w_block_pad_up) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};
        constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(DstOpLengths{});

        constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
        constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);

        constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;

        const Float* p_src_tmp =
            p_src + src_desc.GetOffsetFromMultiIndex(
                        c_block_data_begin,
                        (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
                        (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
                        n_block_data_begin);

#if 0
        if(get_thread_local_1d_id() == 0)
        {
            print_ConstantTensorDescriptor(src_desc, "src_desc: ");
            print_ConstantTensorDescriptor(dst_desc, "dst_desc: ");
            print_ConstantTensorDescriptor(ref_desc, "ref_desc: ");

            printf("%u %u, \t"
                   "h_global_pad_low %u w_global_pad_low %u \t"
                   "h_block_pad_low %u w_block_pad_low %u h_block_pad_up %u w_block_pad_up %u \t"
                   "\n",
                   get_block_1d_id(),
                   get_thread_local_1d_id(),
                   h_global_pad_low,
                   w_global_pad_low,
                   h_block_pad_low,
                   w_block_pad_low,
                   h_block_pad_up,
                   w_block_pad_up);
        }
#endif

        for(index_t iloop = 0; iloop < NLoop; ++iloop)
        {
            index_t is = get_thread_local_1d_id() + iloop * BlockSize;

            index_t did[4];

            did[0] = is / ref_desc.GetStride(I0);
            is -= did[0] * ref_desc.GetStride(I0);

            did[1] = is / ref_desc.GetStride(I1);
            is -= did[1] * ref_desc.GetStride(I1);

            did[2] = is / ref_desc.GetStride(I2);
            is -= did[2] * ref_desc.GetStride(I2);

            did[3] = is / ref_desc.GetStride(I3);

            const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);

            p_dst[bindex] =
                (did[1] < h_block_pad_low || did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
                 did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
                    ? Float(0)
                    : p_src_tmp[src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3])];
        }

        constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);

        if(has_tail)
        {
            index_t is = get_thread_local_1d_id() + NLoop * BlockSize;

            if(is < ref_desc.GetElementSize())
            {
                index_t did[4];

                did[0] = is / ref_desc.GetStride(I0);
                is -= did[0] * ref_desc.GetStride(I0);

                did[1] = is / ref_desc.GetStride(I1);
                is -= did[1] * ref_desc.GetStride(I1);

                did[2] = is / ref_desc.GetStride(I2);
                is -= did[2] * ref_desc.GetStride(I2);

                did[3] = is / ref_desc.GetStride(I3);

                const index_t bindex =
                    dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);

                p_dst[bindex] =
                    (did[1] < h_block_pad_low ||
                     did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
                     did[2] < w_block_pad_low ||
                     did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
                        ? Float(0)
                        : p_src_tmp[src_desc.GetOffsetFromMultiIndex(
                              did[0], did[1], did[2], did[3])];
            }
        }
    }
};
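// Note on BlockwiseChwnTensorCopyPadded::Run above: destination elements whose H index lies
// within h_block_pad_low of the lower edge of the tile, or within h_block_pad_up of its upper
// edge (and likewise for W with w_block_pad_low / w_block_pad_up), are written as zero rather
// than read from memory; all other elements are fetched through p_src_tmp, which Run offsets
// once per call from the block data origin and the global lower pads. This is how the implicit
// zero-padding of the input tensor is realized without ever reading the pad region.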
// starting point need to be aligned to float4 or float2 or float
// stride3 need to be 1 for both source and destination
template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class CopyLengths,
          class ThreadPerDims,
          index_t DataPerRead>
struct Blockwise4dTensorCopy3
{
    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

    index_t mSrcMyThreadOffset;
    index_t mDstMyThreadOffset;

    __device__ Blockwise4dTensorCopy3()
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        static_assert(DataPerRead == 1 ||
                          (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
                      "wrong! only support stride3 == 1 if DataPerRead > 1!\n");

        static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                      "wrong! only support DataPerRead == 1, 2 or 4!\n");

        static_assert(
            SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
                DstDesc{}.GetStride(I2) % DataPerRead == 0,
            "wrong! src and dst stride2 should be multiple of DataPerRead to keep alignment");

        constexpr index_t L0 = CopyLengths{}.Get(I0);
        constexpr index_t L1 = CopyLengths{}.Get(I1);
        constexpr index_t L2 = CopyLengths{}.Get(I2);
        constexpr index_t L3 = CopyLengths{}.Get(I3);

        constexpr index_t thread_per_d0 = ThreadPerDims{}.Get(I0);
        constexpr index_t thread_per_d1 = ThreadPerDims{}.Get(I1);
        constexpr index_t thread_per_d2 = ThreadPerDims{}.Get(I2);
        constexpr index_t thread_per_d3 = ThreadPerDims{}.Get(I3);

        // we allow out-of-bound read from src in D3 dimension,
        // but we need to make sure dst stride is big enough,
        // so that the out-of-bound write won't contaminate next line in dst
        constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

        static_assert(nloop_d3 * thread_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2),
                      "wrong! out-of-bound write will contaminate next line!\n");

        static_assert(L0 % thread_per_d0 == 0 && L1 % thread_per_d1 == 0 &&
                          L2 % thread_per_d2 == 0,
                      "wrong! L0, L1, L2 should be divided evenly!\n");

        static_assert(BlockSize >= thread_per_d0 * thread_per_d1 * thread_per_d2 * thread_per_d3,
                      "wrong! BlockSize is not big enough for ThreadPerDims!");

        constexpr index_t num_active_thread =
            accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});

        if(BlockSize > num_active_thread)
        {
            if(get_thread_local_1d_id() >= num_active_thread)
            {
                return;
            }
        }

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(ThreadPerDims{});

        const auto thread_multi_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0],
                                                               thread_multi_id[1],
                                                               thread_multi_id[2],
                                                               thread_multi_id[3] * DataPerRead);

        mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0],
                                                               thread_multi_id[1],
                                                               thread_multi_id[2],
                                                               thread_multi_id[3] * DataPerRead);
    }
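    // Each active thread owns a fixed starting offset (mSrcMyThreadOffset / mDstMyThreadOffset)
    // computed above from its position in the ThreadPerDims cluster; Run and the register
    // clipboard variants below then step the thread through the CopyLengths volume in strides
    // of ThreadPerDims (times DataPerRead along D3), so the whole tile is covered cooperatively.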
    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr index_t L0 = CopyLengths{}.Get(I0);
        constexpr index_t L1 = CopyLengths{}.Get(I1);
        constexpr index_t L2 = CopyLengths{}.Get(I2);
        constexpr index_t L3 = CopyLengths{}.Get(I3);

        constexpr index_t thread_per_d0 = ThreadPerDims{}.Get(I0);
        constexpr index_t thread_per_d1 = ThreadPerDims{}.Get(I1);
        constexpr index_t thread_per_d2 = ThreadPerDims{}.Get(I2);
        constexpr index_t thread_per_d3 = ThreadPerDims{}.Get(I3);

        constexpr index_t num_active_thread =
            thread_per_d0 * thread_per_d1 * thread_per_d2 * thread_per_d3;

        if(BlockSize > num_active_thread)
        {
            if(get_thread_local_1d_id() >= num_active_thread)
            {
                return;
            }
        }

        constexpr index_t nloop_d0 = L0 / thread_per_d0;
        constexpr index_t nloop_d1 = L1 / thread_per_d1;
        constexpr index_t nloop_d2 = L2 / thread_per_d2;
        constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

#pragma unroll
        for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0)
        {
#pragma unroll
            for(index_t iloop_d1 = 0; iloop_d1 < nloop_d1; ++iloop_d1)
            {
#pragma unroll
                for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
                {
#pragma unroll
                    for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
                    {
                        const index_t src_offset =
                            SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
                                                              iloop_d1 * thread_per_d1,
                                                              iloop_d2 * thread_per_d2,
                                                              iloop_d3 * thread_per_d3 *
                                                                  DataPerRead);

                        const index_t dst_offset =
                            DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
                                                              iloop_d1 * thread_per_d1,
                                                              iloop_d2 * thread_per_d2,
                                                              iloop_d3 * thread_per_d3 *
                                                                  DataPerRead);

                        *(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) =
                            *(reinterpret_cast<const vector_t*>(
                                &p_src[src_offset + mSrcMyThreadOffset]));
                    }
                }
            }
        }
    }

    __device__ constexpr index_t GetRegisterClipboardSize() const
    {
        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr index_t L0 = CopyLengths{}.Get(I0);
        constexpr index_t L1 = CopyLengths{}.Get(I1);
        constexpr index_t L2 = CopyLengths{}.Get(I2);
        constexpr index_t L3 = CopyLengths{}.Get(I3);

        constexpr index_t thread_per_d0 = ThreadPerDims{}.Get(I0);
        constexpr index_t thread_per_d1 = ThreadPerDims{}.Get(I1);
        constexpr index_t thread_per_d2 = ThreadPerDims{}.Get(I2);
        constexpr index_t thread_per_d3 = ThreadPerDims{}.Get(I3);

        constexpr index_t nloop_d0 = L0 / thread_per_d0;
        constexpr index_t nloop_d1 = L1 / thread_per_d1;
        constexpr index_t nloop_d2 = L2 / thread_per_d2;
        constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

        return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2 * nloop_d3;
    }
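    // The "register clipboard" is a small per-thread staging buffer of
    // GetRegisterClipboardSize() floats: RunLoadRegisterClipboard gathers this thread's portion
    // of the source tile into it, and RunStoreRegisterClipboard writes that portion out to the
    // destination, so a kernel can split the copy in two and, e.g., schedule other work between
    // the load and the store.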
    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                             Float* __restrict__ p_clipboard) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr index_t L0 = CopyLengths{}.Get(I0);
        constexpr index_t L1 = CopyLengths{}.Get(I1);
        constexpr index_t L2 = CopyLengths{}.Get(I2);
        constexpr index_t L3 = CopyLengths{}.Get(I3);

        constexpr index_t thread_per_d0 = ThreadPerDims{}.Get(I0);
        constexpr index_t thread_per_d1 = ThreadPerDims{}.Get(I1);
        constexpr index_t thread_per_d2 = ThreadPerDims{}.Get(I2);
        constexpr index_t thread_per_d3 = ThreadPerDims{}.Get(I3);

        constexpr index_t num_active_thread =
            thread_per_d0 * thread_per_d1 * thread_per_d2 * thread_per_d3;

        if(BlockSize > num_active_thread)
        {
            if(get_thread_local_1d_id() >= num_active_thread)
            {
                return;
            }
        }

        constexpr index_t nloop_d0 = L0 / thread_per_d0;
        constexpr index_t nloop_d1 = L1 / thread_per_d1;
        constexpr index_t nloop_d2 = L2 / thread_per_d2;
        constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

        constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed(
            Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});

#pragma unroll
        for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0)
        {
#pragma unroll
            for(index_t iloop_d1 = 0; iloop_d1 < nloop_d1; ++iloop_d1)
            {
#pragma unroll
                for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
                {
#pragma unroll
                    for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
                    {
                        const index_t src_offset =
                            SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
                                                              iloop_d1 * thread_per_d1,
                                                              iloop_d2 * thread_per_d2,
                                                              iloop_d3 * thread_per_d3 *
                                                                  DataPerRead);

                        const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
                            iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);

                        *(reinterpret_cast<vector_t*>(&p_clipboard[clipboard_offset])) =
                            *(reinterpret_cast<const vector_t*>(
                                &p_src[src_offset + mSrcMyThreadOffset]));
                    }
                }
            }
        }
    }

    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                              Float* __restrict__ p_dst) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr index_t L0 = CopyLengths{}.Get(I0);
        constexpr index_t L1 = CopyLengths{}.Get(I1);
        constexpr index_t L2 = CopyLengths{}.Get(I2);
        constexpr index_t L3 = CopyLengths{}.Get(I3);

        constexpr index_t thread_per_d0 = ThreadPerDims{}.Get(I0);
        constexpr index_t thread_per_d1 = ThreadPerDims{}.Get(I1);
        constexpr index_t thread_per_d2 = ThreadPerDims{}.Get(I2);
        constexpr index_t thread_per_d3 = ThreadPerDims{}.Get(I3);

        constexpr index_t num_active_thread =
            thread_per_d0 * thread_per_d1 * thread_per_d2 * thread_per_d3;

        if(BlockSize > num_active_thread)
        {
            if(get_thread_local_1d_id() >= num_active_thread)
            {
                return;
            }
        }

        constexpr index_t nloop_d0 = L0 / thread_per_d0;
        constexpr index_t nloop_d1 = L1 / thread_per_d1;
        constexpr index_t nloop_d2 = L2 / thread_per_d2;
        constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

        constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed(
            Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});

#pragma unroll
        for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0)
        {
#pragma unroll
            for(index_t iloop_d1 = 0; iloop_d1 < nloop_d1; ++iloop_d1)
            {
#pragma unroll
                for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
                {
#pragma unroll
                    for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
                    {
                        const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
                            iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);

                        const index_t dst_offset =
                            DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
                                                              iloop_d1 * thread_per_d1,
                                                              iloop_d2 * thread_per_d2,
                                                              iloop_d3 * thread_per_d3 *
                                                                  DataPerRead);

                        *(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) =
                            *(reinterpret_cast<const vector_t*>(&p_clipboard[clipboard_offset]));
                    }
                }
            }
        }
    }
};
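// Illustrative sketch (not compiled): a hypothetical use of Blockwise4dTensorCopy3 where the
// copy is split into its load and store halves through the register clipboard. All numeric
// choices (BlockSize, lengths, thread partition, DataPerRead) and the pointer names are
// assumptions made for this sketch only.
#if 0
__device__ void example_blockwise_copy3(const float* __restrict__ p_global,
                                        float* __restrict__ p_lds)
{
    constexpr index_t BlockSize = 256;

    constexpr auto desc = make_ConstantTensorDescriptor_packed(Sequence<1, 8, 16, 64>{});

    const auto blockwise_copy = Blockwise4dTensorCopy3<BlockSize,
                                                       float,
                                                       decltype(desc),
                                                       decltype(desc),
                                                       Sequence<1, 8, 16, 64>, // CopyLengths
                                                       Sequence<1, 4, 4, 16>,  // ThreadPerDims
                                                       1>{};

    // per-thread staging buffer sized by the copy object
    float p_clipboard[blockwise_copy.GetRegisterClipboardSize()];

    blockwise_copy.RunLoadRegisterClipboard(p_global, p_clipboard);
    // ... other work (e.g. math on previously staged data) could be scheduled here ...
    blockwise_copy.RunStoreRegisterClipboard(p_clipboard, p_lds);
}
#endif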
template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class MapDst2Src>
struct Blockwise4dTensorCopyReorder1
{
    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        auto f_copy = [](const Float& src, Float& dst) { dst = src; };

        blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src<BlockSize>(
            SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
    }
};

} // namespace ck
#endif