Commit bc9ea646 authored by Chao Liu's avatar Chao Liu


use ford/for instead of static_ford/static_for in threadwise copy; somehow register spill is greatly reduced on AMD
parent 5636576f
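
Editor's note on the change: `static_ford`/`static_for` instantiate the loop body once per index, so every index (and every offset derived from it) is a compile-time constant and the loop nest is fully unrolled, while `ford`/plain `for` iterate at run time. The following is a minimal, self-contained sketch of that difference; it is not the ck implementation, and `static_for`, `runtime_for`, and `fill_example` are illustrative names only.

```cpp
// Minimal sketch (C++14), illustrative names only -- not the ck API.
#include <cstddef>
#include <type_traits>
#include <utility>

// "static_for"-style: the functor is instantiated once per index and each
// index arrives as a std::integral_constant, so the loop is fully unrolled
// and all derived offsets are compile-time constants.
template <std::size_t... Is, class F>
void static_for_impl(std::index_sequence<Is...>, F&& f)
{
    using swallow = int[];
    (void)swallow{0, (f(std::integral_constant<std::size_t, Is>{}), 0)...};
}

template <std::size_t N, class F>
void static_for(F&& f)
{
    static_for_impl(std::make_index_sequence<N>{}, std::forward<F>(f));
}

// "for"-style: an ordinary runtime loop; the compiler decides whether to
// unroll, and the index is a plain value held in a register.
template <class F>
void runtime_for(std::size_t n, F&& f)
{
    for(std::size_t i = 0; i < n; ++i)
        f(i);
}

// Usage: both walk the same buffer, but only the first guarantees that
// every element access uses a constant index.
inline void fill_example(float* buf)
{
    static_for<4>([&](auto i) { buf[i] = static_cast<float>(i); });
    runtime_for(4, [&](std::size_t i) { buf[i] = static_cast<float>(i); });
}
```

Fully unrolling the copy keeps many intermediate values live at once, which can raise register pressure; a runtime loop lets the compiler reuse registers across iterations. This is consistent with the reduced register spill reported in the commit message, though the exact cause is compiler-dependent.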
@@ -155,7 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not satisfied");
-#if 1
+#if 0
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in
@@ -178,7 +178,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
#else
auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
BlockSize,
-Float,
decltype(in_e_n1_b_n2_global_merged_desc),
decltype(in_e_n1_b_n2_block_desc),
MergedTensorCoordinate<decltype(in_e_n1_b_n2_global_merged_desc)>,
@@ -200,7 +199,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Sequence<EPerBlock, KPerBlock>{},
Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
-#if 1
+#if 0
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
// this copy operator already have blockwise offset built-in
@@ -223,7 +222,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
#else
auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
BlockSize,
-Float,
decltype(wei_e_k_global_desc),
decltype(wei_e_k_block_desc),
NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
@@ -326,7 +324,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
-#if 1
+#if 0
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
// blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
// True);
@@ -358,7 +356,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
-#if 1
+#if 0
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
// blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
@@ -439,17 +437,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
k_thread_data_on_global, 0, b_thread_data_on_global, 0);
#if 0
-threadwise_generic_tensor_slice_copy_v1(
-out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
-p_out_thread,
-{0, 0, 0, 0, 0, 0, 0, 0},
-out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
-p_out_thread_on_global,
-{0, 0, 0, 0, 0, 0, 0, 0},
-out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-arithmetic_sequence_gen<0, 8, 1>::type{},
-Number<1>{});
-#elif 0
ThreadwiseGenericTensorSliceCopy_v1r1<
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -461,7 +448,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
1,
1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
.Run(p_out_thread, p_out_thread_on_global);
-#elif 1
+#elif 0
ThreadwiseGenericTensorSliceCopy_v1r2<
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -471,7 +458,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
1,
1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
.Run(p_out_thread, p_out_thread_on_global);
-#elif 0
+#elif 1
ThreadwiseGenericTensorSliceCopy_v2<
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
...
@@ -22,7 +22,6 @@ namespace ck {
// repeat-length on the merged dimension need to be 1. These sanity checks are performed
// in constructor of BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
-class Float,
class SrcDesc,
class DstDesc,
class SliceLengths,
@@ -202,8 +201,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
return GetRegisterBufferDescriptor().GetElementSpace();
}
-__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
-                                      Float* __restrict__ p_buffer) const
+template <class TData>
+__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
+                                      TData* __restrict__ p_buffer) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
@@ -255,7 +255,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
thread_sub_tensor_lengths,
SrcDimAccessOrder{},
Number<SrcDataPerAccess>{});
-#elif 0
+#elif 1
ThreadwiseGenericTensorSliceCopy_v1r1<
SrcDesc,
decltype(thread_buffer_desc),
@@ -281,8 +281,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
}
-__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_buffer,
-                                       Float* __restrict__ p_dst) const
+template <class TData>
+__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
+                                       TData* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
@@ -333,7 +334,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
thread_sub_tensor_lengths,
DstDimAccessOrder{},
Number<DstDataPerAccess>{});
-#elif 0
+#elif 1
ThreadwiseGenericTensorSliceCopy_v1r1<
decltype(thread_buffer_desc),
DstDesc,
@@ -360,9 +361,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
}
-__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
+template <class TData>
+__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{
-Float p_buffer[GetRegisterBufferSize()];
+TData p_buffer[GetRegisterBufferSize()];
RunLoadRegisterBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst);
...
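
The hunks above also drop the `Float` class-template parameter from `BlockwiseGenericTensorSliceCopy_v1` and template the `Run*` member functions on the element type `TData` instead. A stripped-down sketch of that refactoring pattern follows; `BlockwiseCopyOld`/`BlockwiseCopyNew`, the fixed buffer size, and the plain (non-`__device__`) functions are illustrative assumptions, not the ck classes.

```cpp
// Before: the element type is a class template parameter, so a separate
// copier type is instantiated per data type.
template <int BlockSize, class Float>
struct BlockwiseCopyOld
{
    void Run(const Float* p_src, Float* p_dst) const
    {
        Float buffer[8]; // register staging buffer; size illustrative
        for(int i = 0; i < 8; ++i) buffer[i] = p_src[i];
        for(int i = 0; i < 8; ++i) p_dst[i] = buffer[i];
    }
};

// After: the element type is deduced at the call site, so the same copier
// type (and its compile-time layout state) works for any TData.
template <int BlockSize>
struct BlockwiseCopyNew
{
    template <class TData>
    void Run(const TData* p_src, TData* p_dst) const
    {
        TData buffer[8];
        for(int i = 0; i < 8; ++i) buffer[i] = p_src[i];
        for(int i = 0; i < 8; ++i) p_dst[i] = buffer[i];
    }
};
```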
@@ -10,10 +10,18 @@
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
#endif
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
+#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#endif
namespace ck {
// user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
@@ -216,6 +224,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
constexpr auto src_data_begin_id = src_access_id.Modify(
src_vector_access_dim,
@@ -239,6 +248,31 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
});
});
+#else
+ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
+auto src_data_begin_id = src_access_id;
+src_data_begin_id(src_vector_access_dim) =
+src_access_id[src_vector_access_dim] * src_data_per_access;
+const index_t src_offset =
+SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
+// load vector from src
+const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+// unpack vector into buffer
+for(index_t i = 0; i < SrcDataPerAccess; ++i)
+{
+auto scalar_id = make_zero_array<index_t, nDim>();
+scalar_id(src_vector_access_dim) = i;
+const index_t buffer_offset =
+buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
+p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+}
+});
+#endif
}
// copy data from buffer to dst
@@ -252,6 +286,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
constexpr auto dst_data_begin_id = dst_access_id.Modify(
dst_vector_access_dim,
@@ -277,6 +312,33 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
// store vector into dst
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
});
+#else
+ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
+auto dst_data_begin_id = dst_access_id;
+dst_data_begin_id(dst_vector_access_dim) =
+dst_access_id[dst_vector_access_dim] * dst_data_per_access;
+vector_t vector_data;
+// pack vector from buffer
+for(index_t i = 0; i < DstDataPerAccess; ++i)
+{
+auto scalar_id = make_zero_array<index_t, nDim>();
+scalar_id(dst_vector_access_dim) = i;
+const index_t buffer_offset =
+buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
+reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+}
+const index_t dst_offset =
+DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
+// store vector into dst
+*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
+});
+#endif
}
}
@@ -373,7 +435,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
-#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
auto long_vector_access_id) {
@@ -524,6 +586,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2
using SrcNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
static_ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
@@ -541,6 +604,21 @@ struct ThreadwiseGenericTensorSliceCopy_v2
p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
});
});
+#else
+ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
+const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset();
+ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
+const index_t buffer_offset =
+buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
+const index_t src_normal_offset =
+SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
+p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
+});
+});
+#endif
// DstMergedDimSliceLengthsHack has entry same as SliceLengths on dst merged dimensions,
// but 1 on normal dimensions;
@@ -553,6 +631,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2
using DstNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
static_ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
@@ -570,6 +649,21 @@ struct ThreadwiseGenericTensorSliceCopy_v2
p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
});
});
+#else
+ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
+TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset();
+ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
+const index_t buffer_offset =
+buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
+const index_t dst_normal_offset =
+DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
+p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
+});
+});
+#endif
}
// T can be Sequence or Array
...
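
The new `CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_*` flags above follow a default-off pattern: the header defines them to 0 under `#ifndef` unless a config header has already set them, and the copy routines select the `static_ford` or `ford` path with a plain `#if`. Below is a condensed sketch of that wiring; the macro name is taken from the diff, while `run_copy_loop` and its body are invented for illustration, with `#pragma unroll` standing in for the compile-time-unrolled path.

```cpp
// Condensed sketch of the feature-flag wiring; run_copy_loop is illustrative.
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif

template <class F>
void run_copy_loop(int n, F&& body)
{
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
    // Flag set to 1 by a config header: request full unrolling
    // (a stand-in for the static_ford path).
#pragma unroll
    for(int i = 0; i < n; ++i)
        body(i);
#else
    // Default 0 after this commit: ordinary runtime loop (the ford path).
    for(int i = 0; i < n; ++i)
        body(i);
#endif
}
```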
@@ -8,7 +8,9 @@
#define CK_USE_AMD_INLINE_ASM 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
namespace ck {
...
@@ -10,7 +10,9 @@
#define CK_USE_AMD_INLINE_ASM 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
namespace ck {
...
@@ -112,14 +112,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
-using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 4, 1>;
-using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 4, 4>;
+using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 2, 2>;
+using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 8, 2>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
-constexpr index_t InBlockCopySrcDataPerRead_B = 4;
-constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1;
+constexpr index_t InBlockCopySrcDataPerRead_B = 2;
+constexpr index_t InBlockCopyDstDataPerWrite_N2 = 2;
using WeiBlockCopySubLengths_E_K = Sequence<2, 2>;
using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
...
@@ -16,7 +16,7 @@
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
struct GeneratorTensor_1
{
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
...