Commit 41cdde99 authored by Chao Liu's avatar Chao Liu
Browse files

add looping Orders into ford and static_ford

parent 0271338e
...@@ -199,7 +199,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -199,7 +199,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
} }
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src, __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_Buffer) const Float* __restrict__ p_buffer) const
{ {
constexpr auto thread_sub_tensor_lengths = SubLengths{}; constexpr auto thread_sub_tensor_lengths = SubLengths{};
...@@ -216,24 +216,24 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -216,24 +216,24 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr auto src_thread_data_multi_id_begin = constexpr auto src_thread_data_multi_id_begin =
repeat_multi_id * data_per_cluster_per_dims; repeat_multi_id * data_per_cluster_per_dims;
constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
constexpr index_t src_offset = constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
constexpr index_t Buffer_offset = constexpr index_t buffer_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
#else #else
ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) { ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const index_t src_offset = const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
const index_t Buffer_offset = const index_t buffer_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
#endif #endif
// By position the origin of the per-thread window at the point, where multi-index // By position the origin of the per-thread window at the point, where multi-index
...@@ -247,7 +247,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -247,7 +247,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
p_src + src_offset + mThreadSrcOffset, p_src + src_offset + mThreadSrcOffset,
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>(),
thread_tensor_desc, thread_tensor_desc,
p_Buffer + Buffer_offset, p_buffer + buffer_offset,
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths, thread_sub_tensor_lengths,
SrcAccessOrder{}, SrcAccessOrder{},
...@@ -255,7 +255,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -255,7 +255,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
}); });
} }
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_Buffer, __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_buffer,
Float* __restrict__ p_dst) const Float* __restrict__ p_dst) const
{ {
constexpr auto thread_sub_tensor_lengths = SubLengths{}; constexpr auto thread_sub_tensor_lengths = SubLengths{};
...@@ -270,23 +270,23 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -270,23 +270,23 @@ struct BlockwiseGenericTensorSliceCopy_v1
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
constexpr index_t Buffer_offset = constexpr index_t buffer_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
constexpr index_t dst_offset = constexpr index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin); DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#else #else
ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) { ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const index_t Buffer_offset = const index_t buffer_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin); const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#endif #endif
...@@ -299,7 +299,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -299,7 +299,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
// If in the future, you want to enable SubLengths > 1 at the merged dimension, // If in the future, you want to enable SubLengths > 1 at the merged dimension,
// special care in implementation is needed // special care in implementation is needed
threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc, threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
p_Buffer + Buffer_offset, p_buffer + buffer_offset,
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>(),
DstDesc{}, DstDesc{},
p_dst + dst_offset + mThreadDstOffset, p_dst + dst_offset + mThreadDstOffset,
...@@ -312,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -312,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{ {
Float p_Buffer[GetRegisterBufferSize()]; Float p_buffer[GetRegisterBufferSize()];
RunLoadRegisterBuffer(p_src, p_Buffer); RunLoadRegisterBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_Buffer, p_dst); RunStoreRegisterBuffer(p_buffer, p_dst);
} }
// When moving the slicing windows along a merged dimension, if the strides of the // When moving the slicing windows along a merged dimension, if the strides of the
......
...@@ -24,105 +24,120 @@ struct is_static<Sequence<Is...>> : integral_constant<bool, true> ...@@ -24,105 +24,120 @@ struct is_static<Sequence<Is...>> : integral_constant<bool, true>
}; };
// RemainLengths: Sequence<...> // RemainLengths: Sequence<...>
template <class RemainLengths> // Orders: Sequence<...>
template <class RemainLengths, class Orders>
struct static_ford_impl struct static_ford_impl
{ {
// F signature: F(Sequence<...> multi_id) __host__ __device__ constexpr static_ford_impl()
// CurrentMultiIndex: Sequence<...>
template <class F, class CurrentMultiIndex>
__host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const
{ {
static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here"); static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
}
// F signature: F(Sequence<...>)
// CurrentOrderedId: Sequence<...>
template <class F, class CurrentOrderedId>
__host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const
{
static_for<0, RemainLengths::Front(), 1>{}([=](auto I) { static_for<0, RemainLengths::Front(), 1>{}([=](auto I) {
static_ford_impl<decltype(RemainLengths::PopFront())>{}(f, static_ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
CurrentMultiIndex::PushBack(I)); f, CurrentOrderedId::PushBack(I));
}); });
} }
}; };
template <> template <class Orders>
struct static_ford_impl<Sequence<>> struct static_ford_impl<Sequence<>, Orders>
{ {
// F signature: F(Sequence<...> multi_id) // F signature: F(Sequence<...>)
// CurrentMultiIndex: Sequence<...> // OrderedId: Sequence<...>
template <class F, class CurrentMultiIndex> template <class F, class OrderedId>
__host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const __host__ __device__ constexpr void operator()(F f, OrderedId) const
{ {
f(CurrentMultiIndex{}); // retrieve unordered Id
f(OrderedId::ReorderGivenOld2New(Orders{}));
} }
}; };
// Lengths is Sequence<...> // Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
template <class Lengths> // Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct static_ford struct static_ford
{ {
__host__ __device__ constexpr static_ford()
{
static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
}
// F signature: F(Sequence<...> multi_id) // F signature: F(Sequence<...> multi_id)
// multi_id is the unordered multi-index
template <class F> template <class F>
__host__ __device__ constexpr void operator()(F f) const __host__ __device__ constexpr void operator()(F f) const
{ {
static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});
static_ford_impl<decltype(ordered_lengths), Orders>{}(f, Sequence<>{});
static_ford_impl<Lengths>{}(f, Sequence<>{});
} }
}; };
template <index_t RemainDim> // RemainLengths: Sequence<...>
// Orders: Sequence<...>
template <class RemainLengths, class Orders>
struct ford_impl struct ford_impl
{ {
// F signature: F(Array<...> multi_id) __host__ __device__ constexpr ford_impl()
// CurrentMultiIndex: Array<...>
// RemainLengths: Sequence<...>
template <class F, class CurrentMultiIndex, class RemainLengths>
__host__ __device__ constexpr void
operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
{ {
static_assert(RemainLengths::GetSize() == RemainDim, "wrong!"); static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
static_assert(RemainDim > 1, "wrong!"); }
constexpr auto next_length = RemainLengths{}.Front();
for(index_t i = 0; i < next_length; ++i) // F signature: F(Array<...> multi_id)
// CurrentOrderedId: Array<...>
template <class F, class CurrentOrderedId>
__host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
{
for(index_t i = 0; i < RemainLengths::Front(); ++i)
{ {
ford_impl<RemainDim - 1>{}(f, current_multi_id.PushBack(i), RemainLengths{}.PopFront()); ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
f, current_ordered_id.PushBack(i));
} }
} }
}; };
template <> template <class Orders>
struct ford_impl<1> struct ford_impl<Sequence<>, Orders>
{ {
// F signature: F(Array<...> multi_id) // F signature: F(Array<...> multi_id)
// CurrentMultiIndex: Array<...> // CurrentOrderedId: Array<...>
// RemainLengths: Sequence<...> template <class F, class CurrentOrderedId>
template <class F, class CurrentMultiIndex, class RemainLengths> __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
__host__ __device__ constexpr void
operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
{
static_assert(RemainLengths::GetSize() == 1, "wrong!");
constexpr index_t last_length = RemainLengths{}.Front();
for(index_t i = 0; i < last_length; ++i)
{ {
f(current_multi_id.PushBack(i)); // retrieve unordered Id
} f(reorder_array_given_old2new(current_ordered_id, Orders{}));
} }
}; };
// Lengths is Sequence<...> // Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
template <class Lengths> // Orders is Sequence<...>, it is the order of dimension in which ford will loop over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct ford struct ford
{ {
__host__ __device__ constexpr ford()
{
static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
}
// F signature: F(Array<...> multi_id) // F signature: F(Array<...> multi_id)
// multi_id is the unordered multi-index
template <class F> template <class F>
__host__ __device__ constexpr void operator()(F f) const __host__ __device__ constexpr void operator()(F f) const
{ {
constexpr index_t first_length = Lengths{}.Front(); for(index_t i = 0; i < Lengths::Front(); ++i)
for(index_t i = 0; i < first_length; ++i)
{ {
ford_impl<Lengths::GetSize() - 1>{}(f, Array<index_t, 1>{i}, Lengths{}.PopFront()); ford_impl<decltype(Lengths::PopFront()), Orders>{}(f, Array<index_t, 1>{i});
} }
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment