"vscode:/vscode.git/clone" did not exist on "a75e0acb815868a7cb604f8ba9511e6dd41a7385"
Commit b37cb71f authored by Wen-Heng (Jack) Chung

Enable bwd wrw

parent c5143bca
@@ -2,465 +2,57 @@
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "tensor_descriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace ck {
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
// This function assumes each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user needs to make sure
// that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in the constructor of BlockwiseGenericTensorSliceCopy_v1.
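// Illustrative sketch (hypothetical numbers, not taken from this commit): suppose a merged
// dimension E is built from original dimensions with lengths {N = 2, Ho = 4, Wo = 8}.
// A sub-length of 4 on E is acceptable because the last original length (Wo = 8) is
// divisible by 4, and the repeat-length on E must then be 1; a sub-length of 3 would
// violate the divisibility requirement checked in the constructor below.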
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;
// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
Array<index_t, nDim> dst_block_data_id_begin)
{
// check NDim consistency
static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong!");
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
// additional check for merged dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
// src
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_src =
SrcDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths should be 1 on merged dimension");
});
// dst
static_if<DstDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively writing a normal (not merged) tensor
constexpr auto idim_last_original_dst =
DstDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths should be 1 on merged dimension");
});
});
// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
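// For example (hypothetical values, for illustration only): with
// ThreadClusterLengths = {2, 32}, an identity ThreadClusterArrangeOrder = {0, 1} and
// SubLengths = {4, 1}, thread 35 gets thread_cluster_id = {1, 3} and therefore
// thread_data_id_begin = {4, 3} inside the block's slicing window.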
// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_id_begin + thread_data_id_begin);
mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_id_begin + thread_data_id_begin);
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);
mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
__device__ static constexpr auto GetRegisterBufferDescriptor()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
}
__device__ static constexpr index_t GetRegisterBufferSize()
{
return GetRegisterBufferDescriptor().GetElementSpace();
}
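// Worked example (hypothetical values, for illustration only): with SliceLengths = {8, 32},
// SubLengths = {4, 1} and ThreadClusterLengths = {2, 32}, repeat_lengths = {1, 1}, so each
// thread's register buffer is a packed {4, 1} tensor holding 4 elements.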
template <class TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the multi-index
// of SrcDesc (which might be a merged tensor) is all-zero. This threadwise slice copy
// assumes each thread is copying a normal (not merged) tensor.
// To satisfy this assumption, the user needs to make sure that, on a merged dimension
// that contains multiple original dimensions, the length of the last original
// dimension is evenly divisible by its sub-length. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in the constructor
// of BlockwiseGenericTensorSliceCopy_v1.
ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
decltype(thread_buffer_desc),
SubLengths,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcDataPerAccess,
1>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
});
}
template <class TData>
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
TData* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the multi-index
// of SrcDesc (which might be a merged tensor) is all-zero. This threadwise slice copy
// assumes each thread is copying a normal (not merged) tensor.
// To satisfy this assumption, the user needs to make sure that, on a merged dimension
// that contains multiple original dimensions, the length of the last original
// dimension is evenly divisible by its sub-length. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in the constructor
// of BlockwiseGenericTensorSliceCopy_v1.
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
DstDesc,
SubLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>(
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
});
}
template <class TData>
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
RunLoadRegisterBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst);
}
// When moving the slicing window along a merged dimension, if the strides of the
// original dimensions contained by the merged dimension are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in the positive direction (and vice versa for movement in the negative
// direction). As a result, there is the possibility that the offset calculation may
// result in unsigned integer underflow (due to the "-" operation). However, this hazard
// should not happen as long as the user makes sure the slicing window is not moved out of
// the boundary of the tensor being sliced. This function doesn't do a runtime sanity
// check for an out-of-bound slicing window, for performance reasons.
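// Example of the hazard (hypothetical lengths/strides): a merged dimension containing
// original dimensions with lengths {2, 3} and strides {1, 4} (not descending). Moving the
// window from merged index 2 (multi-index {0, 2}, offset 8) to merged index 3 (multi-index
// {1, 0}, offset 1) moves forward in the merged index yet decreases the offset, which is
// why the update below adds the new partial offset before subtracting the old one.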
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
// logic for a merged dimension; it also works for a non-merged dimension, but may
// be unnecessarily complicated for the compiler to remove calculations that are useless
// for a non-merged dimension
// extract partial original dimensions
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
// calculate new partial original multi-id
auto old_src_partial_original_id =
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);
auto new_src_partial_original_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
constexpr auto IDimOriginal = src_partial_original_dims[I];
mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
});
// calculate new partial offset on this merged dimension
const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];
const index_t new_src_partial_offset =
src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto) {
// Logic for a non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, the compiler
// should be able to remove these calculations.
// TODO: make sure the compiler actually removes them in this case.
// It is the user's responsibility to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be hazards like
// unsigned integer underflow. There is NO runtime sanity check to prevent the hazard.
constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) += StepSize;
mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
}).Else([&](auto fwd) {
mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize;
mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
});
});
}
template <class T, bool PositiveDirection>
__device__ void
MoveSrcSlicingWindow(T step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
static_for<0, nDim, 1>{}([&](auto idim) {
if(step_sizes[idim] != 0)
{
MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
}
});
}
};
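// A minimal usage sketch (hypothetical parameters and descriptors, not taken from this
// commit): copy an 8x32 slice with a 64-thread block, giving each thread a 4x1 sub-tensor.
//
// using BlockCopy = BlockwiseGenericTensorSliceCopy_v1<64,              // BlockSize
//                                                      SrcDesc,         // source tensor
//                                                      DstDesc,         // destination tensor
//                                                      Sequence<8, 32>, // SliceLengths
//                                                      Sequence<4, 1>,  // SubLengths
//                                                      Sequence<2, 32>, // ThreadClusterLengths
//                                                      Sequence<0, 1>,  // ThreadClusterArrangeOrder
//                                                      Sequence<0, 1>,  // SrcDimAccessOrder
//                                                      Sequence<0, 1>,  // DstDimAccessOrder
//                                                      1, 1, 1, 1>;
// BlockCopy(src_block_origin, dst_block_origin).Run(p_src, p_dst);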
template <index_t BlockSize, template <index_t BlockSize,
class SrcDesc, typename BlockSrcDesc,
class DstDesc, typename BlockDstDesc,
class SrcCoordinate, typename BlockSliceLengths,
class DstCoordinate, typename ThreadSliceLengths,
class SliceLengths, typename ThreadClusterLengths,
class SubLengths, typename ThreadClusterArrangeOrder,
class ThreadClusterLengths, typename SrcDimAccessOrder,
class ThreadClusterArrangeOrder, typename DstDimAccessOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim, index_t SrcVectorAccessDim,
index_t DstVectorAccessDim, index_t DstVectorAccessDim,
index_t SrcDataPerAccess, index_t SrcDataPerAccess,
index_t DstDataPerAccess> index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2 struct BlockwiseGenericTensorSliceCopy_v4
{ {
static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
DstCoordinate dst_block_slice_origin) __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
{ const Index& dst_block_slice_origin)
static_assert(nDim == SrcDesc::GetNumOfDimension() && {
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && static_assert(nDim == BlockSrcDesc::GetNumOfDimension() &&
nDim == SubLengths::GetSize() && nDim == BlockDstDesc::GetNumOfDimension() &&
nDim == ThreadClusterLengths::GetSize() && nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterArrangeOrder::GetSize(), nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent"); "wrong! nDim not consistent");
static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{}, static_assert(
"wrong! threads should be mapped to cover entire slicing window"); is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( // map threads to cluster
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! BlockSize not consistent with ThreadClusterLengths"); "wrong! BlockSize not consistent with ThreadClusterLengths");
const auto thread_cluster_id = const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
const auto data_cluster_id = const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin); mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>()); mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
@@ -469,438 +61,166 @@ struct BlockwiseGenericTensorSliceCopy_v2
mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin); mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
} }
__device__ static constexpr index_t GetRegisterBufferSize() __device__ static constexpr index_t GetThreadBufferSize()
{ {
return RegisterBufferDesc::GetElementSpace(); return ThreadBufferDesc::GetElementSpace();
} }
template <class TData> template <typename BlockSrcData,
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const typename ThreadBufferData,
AddressSpace BlockSrcAddressSpace,
AddressSpace ThreadBufferAddressSpace>
__device__ void
RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer,
integral_constant<AddressSpace, BlockSrcAddressSpace>,
integral_constant<AddressSpace, ThreadBufferAddressSpace>) const
{
constexpr auto block_src_address_space =
integral_constant<AddressSpace, BlockSrcAddressSpace>{};
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseLoad.Run_optimized_src_address_calculation(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
else
{
mThreadwiseLoad.Run(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
}
template <typename BlockSrcData, typename ThreadBufferData>
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunLoadThreadBuffer(
p_block_src, p_thread_buffer, generic_address_space, generic_address_space);
}
template <typename ThreadBufferData,
typename BlockDstData,
AddressSpace ThreadBufferAddressSpace,
AddressSpace BlockDstAddressSpace>
__device__ void
RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, ThreadBufferAddressSpace>,
integral_constant<AddressSpace, BlockDstAddressSpace>) const
{
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr auto block_dst_address_space =
integral_constant<AddressSpace, BlockDstAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseStore.Run_optimized_dst_address_calculation(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
else
{
mThreadwiseStore.Run(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunStoreThreadBuffer(
p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);
}
template <typename BlockSrcData,
typename BlockDstData,
AddressSpace BlockSrcAddressSpace,
AddressSpace BlockDstAddressSpace>
__device__ void
Run(const BlockSrcData* p_block_src,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, BlockSrcAddressSpace> block_src_address_space,
integral_constant<AddressSpace, BlockDstAddressSpace> block_dst_address_space) const
{ {
mThreadwiseLoad.Run(p_src, p_buffer); BlockSrcData p_thread_buffer[GetThreadBufferSize()];
}
template <class TData> constexpr auto generic_address_space =
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const integral_constant<AddressSpace, AddressSpace::generic>{};
{
mThreadwiseStore.Run(p_buffer, p_dst); RunLoadThreadBuffer(
p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);
// if there is type conversion, it's done during store
RunStoreThreadBuffer(
p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space);
} }
template <class TData> template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const TData* p_src, TData* p_dst) const __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
{ {
TData p_buffer[GetRegisterBufferSize()]; constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
mThreadwiseLoad.Run(p_src, p_buffer); Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
mThreadwiseStore.Run(p_buffer, p_dst);
} }
template <class T, bool PositiveDirection> template <typename T, bool PositiveDirection>
__device__ void __device__ void
MoveSrcSlicingWindow(T step_sizes, MoveSrcSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction) integral_constant<bool, PositiveDirection> positive_direction)
{ {
mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction); mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
} }
template <class T, bool PositiveDirection> template <typename T, bool PositiveDirection>
__device__ void __device__ void
MoveDstSlicingWindow(T step_sizes, MoveDstSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction) integral_constant<bool, PositiveDirection> positive_direction)
{ {
mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction); mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
} }
private: private:
using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); using ThreadBufferDesc = decltype(make_native_tensor_descriptor_packed(ThreadSliceLengths{}));
using ThreadwiseLoad = using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<BlockSrcDesc,
ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc, ThreadBufferDesc,
RegisterBufferDesc, ThreadSliceLengths,
SrcCoordinate, SrcDimAccessOrder,
NormalTensorCoordinate<RegisterBufferDesc>, SrcVectorAccessDim,
SubLengths, SrcDataPerAccess,
SrcDimAccessOrder, 1>;
SrcDimAccessOrder,
SrcVectorAccessDim, using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
SrcVectorAccessDim, BlockDstDesc,
SrcDataPerAccess, ThreadSliceLengths,
1>; DstDimAccessOrder,
DstVectorAccessDim,
using ThreadwiseStore = 1,
ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc, DstDataPerAccess>;
DstDesc,
NormalTensorCoordinate<RegisterBufferDesc>,
DstCoordinate,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad; ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore; ThreadwiseStore mThreadwiseStore;
}; };
// this will be deprecated
// slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst
// For now, only support SubLengths[...] == 1 on a merged dimension
template <index_t BlockSize,
class Float,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class DataClusterLengths,
class ThreadClusterArrangeOrder,
class SrcAccessOrder,
class DstAccessOrder,
index_t SrcDataPerRead,
index_t DstDataPerWrite>
struct BlockwiseGenericTensorSliceCopy_v1_deprecated
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;
// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1_deprecated(
Array<index_t, nDim> src_block_data_multi_id_begin,
Array<index_t, nDim> dst_block_data_multi_id_begin)
{
// check NDim consistency
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcAccessOrder>::value &&
is_valid_sequence_map<DstAccessOrder>::value,
"wrong!");
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! block size doesn't match with thread cluster size.");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into sub-tensor");
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
// for now, only support SubLengths == 1 on a merged dimension that contains
// multiple original dimensions
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
static_assert(SubLengths::Get(IDim) == 1 ||
(!SrcDesc::ContainMultipleOriginalDimensions(IDim) &&
!DstDesc::ContainMultipleOriginalDimensions(IDim)),
"wrong! only support Sub-Length == 1 on a merged dimension");
});
// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_multi_id =
reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_multi_id_begin + thread_data_multi_id_begin);
mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_multi_id_begin + thread_data_multi_id_begin);
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr index_t idim = IDim;
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
mThreadSrcPartialOffsets(idim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr index_t idim = IDim;
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);
mThreadDstPartialOffsets(idim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
__device__ static constexpr index_t GetRegisterClipboardSize()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto src_thread_data_multi_id_begin =
repeat_multi_id * data_per_cluster_per_dims;
constexpr auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#else
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const index_t src_offset =
SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#endif
threadwise_generic_tensor_slice_copy_v1_deprecated(SrcDesc{},
p_src + src_offset +
mThreadSrcOffset,
make_zero_array<index_t, nDim>(),
thread_tensor_desc,
p_clipboard + clipboard_offset,
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
SrcAccessOrder{},
Number<SrcDataPerRead>{});
});
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths;
constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
constexpr index_t dst_offset =
DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#else
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#endif
threadwise_generic_tensor_slice_copy_v1_deprecated(thread_tensor_desc,
p_clipboard + clipboard_offset,
make_zero_array<index_t, nDim>(),
DstDesc{},
p_dst + dst_offset +
mThreadDstOffset,
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
DstAccessOrder{},
Number<DstDataPerWrite>{});
});
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
Float p_clipboard[GetRegisterClipboardSize()];
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
}
// When moving the slicing window along a merged dimension, if the strides of the
// original dimensions contained by the merged dimension are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in the positive direction (and vice versa for movement in the negative
// direction). As a result, there is the possibility that the offset calculation may
// result in unsigned integer underflow (due to the "-" operation). However, this hazard
// should not happen as long as the user makes sure the slicing window is not moved out of
// the boundary of the tensor being sliced. This function doesn't do a runtime sanity
// check for an out-of-bound slicing window, for performance reasons.
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
constexpr index_t idim = IDim;
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
// logic for a merged dimension; it also works for a non-merged dimension, but may
// be unnecessarily complicated for the compiler to remove calculations that are useless
// for a non-merged dimension
// extract partial original dimensions
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
// calculate new partial original multi-id
auto old_src_partial_original_multi_id =
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);
auto new_src_partial_original_multi_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_multi_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I_) {
constexpr auto I = decltype(I_){};
constexpr index_t idim_original = src_partial_original_dims.Get(I);
mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I];
});
// calculate new partial offset on this merged dimension
const index_t old_src_partial_offset = mThreadSrcPartialOffsets[idim];
const index_t new_src_partial_offset =
src_partial_original_desc.GetOffsetFromMultiIndex(
new_src_partial_original_multi_id);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets(idim) = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto) {
// Logic for a non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, the compiler
// should be able to remove these calculations.
// TODO: make sure the compiler actually removes them in this case.
// It is the user's responsibility to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be hazards like
// unsigned integer underflow. There is NO runtime sanity check to prevent the hazard.
constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(idim_original) += StepSize;
mThreadSrcPartialOffsets(idim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
}).Else([&](auto fwd) {
mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(idim_original) -= StepSize;
mThreadSrcPartialOffsets(idim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
});
});
}
};
} // namespace ck
#endif
@@ -7,98 +7,160 @@
namespace ck {
template <class Float, class Matrix> template <typename Float, class Matrix>
__device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread) __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
{ {
for(index_t i = 0; i < Matrix::NRow(); ++i) for(index_t i = 0; i < Matrix::NRow(); ++i)
{ {
for(index_t j = 0; j < Matrix::NCol(); ++j) for(index_t j = 0; j < Matrix::NCol(); ++j)
{ {
const index_t id = Matrix::GetOffsetFromMultiIndex(i, j); const index_t id = Matrix::CalculateOffset(i, j);
p_thread[id] = Float(0); p_thread[id] = Float(0);
} }
} }
} }
template <class Float, template <typename SrcMatrix,
class SrcMatrix, typename DstMatrix,
class DstMatrix, index_t NSliceRow,
index_t NRow, index_t NSliceCol,
index_t NCol, index_t DataPerAccess>
index_t DataPerRead> struct ThreadwiseMatrixSliceCopy
__device__ void threadwise_matrix_copy(SrcMatrix,
const Float* __restrict__ p_src,
DstMatrix,
Float* __restrict__ p_dst,
Sequence<NRow, NCol>,
Number<DataPerRead>)
{ {
static_assert(NCol % DataPerRead == 0, "wrong! should be NCol % == DataPerRead == 0"); __device__ constexpr ThreadwiseMatrixSliceCopy()
{
static_assert(SrcMatrix::RowStride() % DataPerAccess == 0 &&
DstMatrix::RowStride() % DataPerAccess == 0,
"wrong! wrong alignment");
static_assert(NSliceCol % DataPerAccess == 0,
"wrong! should be NSliceCol % DataPerAccess == 0");
}
constexpr auto src_mtx = SrcMatrix{}; template <typename Data>
constexpr auto dst_mtx = DstMatrix{}; __device__ static void Run(const Data* p_src, Data* p_dst)
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
for(index_t i = 0; i < NRow; ++i)
{ {
for(index_t j = 0; j < NCol; j += DataPerRead) using vector_t = typename vector_type<Data, DataPerAccess>::MemoryType;
for(index_t i = 0; i < NSliceRow; ++i)
{ {
const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j); for(index_t j = 0; j < NSliceCol; j += DataPerAccess)
const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j); {
const index_t src_index = SrcMatrix::CalculateOffset(i, j);
const index_t dst_index = DstMatrix::CalculateOffset(i, j);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) = *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]); *reinterpret_cast<const vector_t*>(&p_src[src_index]);
}
} }
} }
} };
template <class MatrixA, // C += transpose(A) * B
class MatrixB, // Element of matrix can be vectorized data
class MatrixC, template <typename MatrixA, typename MatrixB, typename MatrixC>
bool TransA, struct ThreadwiseGemmTransANormalBNormalC
bool TransB,
bool TransC,
class FloatA,
class FloatB,
class FloatC>
__device__ void threadwise_gemm(MatrixA,
integral_constant<bool, TransA>,
const FloatA* __restrict__ p_a_thread,
MatrixB,
integral_constant<bool, TransB>,
const FloatB* __restrict__ p_b_thread,
MatrixC,
integral_constant<bool, TransC>,
FloatC* __restrict__ p_c_thread)
{ {
static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) { __device__ constexpr ThreadwiseGemmTransANormalBNormalC()
constexpr auto a_mtx = MatrixA{}; {
constexpr auto b_mtx = MatrixB{}; static_assert(MatrixA::NRow() == MatrixB::NRow() && MatrixA::NCol() == MatrixC::NRow() &&
constexpr auto c_mtx = MatrixC{}; MatrixB::NCol() == MatrixC::NCol(),
"wrong!");
}
constexpr index_t M = c_mtx.NRow(); template <typename FloatA, typename FloatB, typename FloatC>
constexpr index_t N = c_mtx.NCol(); __device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
constexpr index_t K = a_mtx.NRow(); // A is transposed {
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
for(index_t k = 0; k < K; ++k) for(index_t k = 0; k < K; ++k)
{ {
for(index_t i = 0; i < M; ++i) for(index_t m = 0; m < M; ++m)
{ {
for(index_t j = 0; j < N; ++j) for(index_t n = 0; n < N; ++n)
{ {
const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j); const index_t bindex = MatrixB::CalculateOffset(k, n);
const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j); const index_t cindex = MatrixC::CalculateOffset(m, n);
p_c_thread[cindex] += math::inner_product_with_conversion<FloatC>{}( p_c[cindex] +=
p_a_thread[aindex], p_b_thread[bindex]); inner_product_with_conversion<FloatC>{}(p_a[aindex], p_b[bindex]);
} }
} }
} }
}).Else([&](auto fwd) { }
// not implemented
static_assert(fwd(false), "wrong! support for this config is not implemented"); #if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
}); template <typename FloatA, typename FloatB, typename FloatC>
} __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet");
for(index_t k = 0; k < K; ++k)
{
for(index_t m = 0; m < M; ++m)
{
const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
static_if<N == 2>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
__outer_product_1x2(
p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]);
});
static_if<N == 4>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t bindex_2 = MatrixB::CalculateOffset(k, 2);
const index_t bindex_3 = MatrixB::CalculateOffset(k, 3);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
const index_t cindex_2 = MatrixC::CalculateOffset(m, 2);
const index_t cindex_3 = MatrixC::CalculateOffset(m, 3);
__outer_product_1x4(p_a[aindex],
p_b[bindex_0],
p_b[bindex_1],
p_b[bindex_2],
p_b[bindex_3],
p_c[cindex_0],
p_c[cindex_1],
p_c[cindex_2],
p_c[cindex_3]);
});
}
}
}
#endif
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
constexpr bool has_amd_asm = is_same<FloatC, float>{} &&
((is_same<FloatA, float>{} && is_same<FloatB, float>{}) ||
(is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
(is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));
static_if<has_amd_asm>{}([&](auto fwd) {
Run_amd_asm(p_a, p_b, fwd(p_c));
}).Else([&](auto) { Run_source(p_a, p_b, p_c); });
#else
Run_source(p_a, p_b, p_c);
#endif
}
};
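// Usage sketch (hypothetical matrix descriptors, for illustration): with A stored K x M
// (i.e. transposed), B stored K x N and C stored M x N, each thread accumulates
// C(m, n) += A(k, m) * B(k, n) over k:
//
// using Gemm = ThreadwiseGemmTransANormalBNormalC<AMat, BMat, CMat>;
// Gemm::Run(p_a_thread, p_b_thread, p_c_thread);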
} // namespace ck
#endif
@@ -2,336 +2,218 @@
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "tensor_descriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp" #include "tensor_coordinate.hpp"
#include "float_types.h"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
namespace ck {
// This version uses multi-index transformation
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst. // It allows the vector size to be different on src and dst.
// It also allows the vector size to be different on src and dst. // The dimensions of vector access should be the same on src and dst.
// It also allows order of access to be different on src and dst. // The dimension access order should be the same on src and dst.
// It use register as buffer to hold all data moving from src to dst. // It is designed for cases, where one of src and dst is register, and
// It is designed for copying small amount of data, and src and dst are // the other is device memory or LDS
// device memory or LDS. template <typename SrcDesc,
// When copying large amout of data, let's hope compiler will reduce register typename DstDesc,
// used for the buffer. typename SliceLengths,
template <class SrcDesc, typename DimAccessOrder,
class DstDesc, index_t VectorAccessDim,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess, index_t SrcDataPerAccess,
index_t DstDataPerAccess> index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r1 struct ThreadwiseGenericTensorSliceCopy_v4r2
{ {
static constexpr index_t nDim = SliceLengths::GetSize(); static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1( using SrcCoord = typename TensorCoordinate<SrcDesc>::type;
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin) using DstCoord = typename TensorCoordinate<DstDesc>::type;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin,
const Index& dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{ {
static_assert(nDim == SrcDesc::GetNumOfDimension() && static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DimAccessOrder::Size(),
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same"); "wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value && static_assert(is_valid_sequence_map<DimAccessOrder>{}, "wrong! map is not valid");
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}( static_assert(
[&](auto fwd) { SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
static_assert( "wrong! cannot evenly divide");
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1"); // TODO:: sanity-check if vectorized memory access is allowed on src and dst
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
} }
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1() __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
: ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(), : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>()) make_zero_array<index_t, nDim>())
{ {
} }
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin) __device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
{ {
mSrcSliceOrigin = src_slice_origin; mSrcSliceOrigin = src_slice_origin;
} }
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin) __device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
{ {
mDstSliceOrigin = dst_slice_origin; mDstSliceOrigin = dst_slice_origin;
} }
template <class SrcData, class DstData> // Will do padding check on src data: Read 0 if src data is in padding area.
__device__ void Run(const SrcData* p_src, DstData* p_dst) const // Will do padding check on dst data: No write if dst data is in padding area.
template <typename SrcData,
typename DstData,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void Run(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{ {
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
SrcData p_src_buffer_[buffer_desc.GetElementSpace()];
SrcData* p_src_buffer = p_src_buffer_;
// copy data from src into src buffer constexpr auto vector_access_dim = Number<VectorAccessDim>{};
{
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{}; constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{}; constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify( constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 constexpr auto long_vector_access_lengths = SliceLengths::Modify(
static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) { vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
constexpr auto src_data_begin_id = src_access_id.Modify(
src_vector_access_dim,
src_access_id[src_vector_access_dim] * src_data_per_access);
const index_t src_offset = ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id); auto long_vector_access_id) {
// load vector from src // data id w.r.t slicing-window
const src_vector_t src_vector_data = auto long_vector_data_begin_id = long_vector_access_id;
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]); long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// unpack vector into buffer // buffer to hold a src long-vector
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) { SrcData p_src_long_vector[long_vector_size];
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(src_vector_access_dim,
i);
constexpr index_t buffer_offset = // zero out buffer
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
p_src_buffer[buffer_offset] = // load data from src to the long-vector buffer
reinterpret_cast<const SrcData*>(&src_vector_data)[i]; for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
}); {
}); auto scalar_id = make_zero_array<index_t, nDim>();
#else scalar_id(vector_access_dim) = i * src_data_per_access;
ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
auto src_data_begin_id = src_access_id;
src_data_begin_id(src_vector_access_dim) =
src_access_id[src_vector_access_dim] * src_data_per_access;
const index_t src_offset = const index_t buffer_offset = i * src_data_per_access;
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
// load vector from src const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
const src_vector_t src_vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
// unpack vector into buffer // Check src vector's padding situation, only check the first data in this src
for(index_t i = 0; i < SrcDataPerAccess; ++i) // vector. It's user's responsibility to make sure all data in the src vector
// has the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset())
{ {
auto scalar_id = make_zero_array<index_t, nDim>(); static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
scalar_id(src_vector_access_dim) = i; #if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
const index_t buffer_offset = __buffer_load<SrcData, SrcDataPerAccess>(
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); fwd(p_src), src_coord.GetOffset(), 0);
#else
p_src_buffer[buffer_offset] = *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
reinterpret_cast<const SrcData*>(&src_vector_data)[i]; *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
}
});
#endif #endif
} }).Else([&](auto) {
// src can be all kinds of memory-space.
// copy data from buffer to dst *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
{ *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType; });
}
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{}; }
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
constexpr auto dst_data_begin_id = dst_access_id.Modify(
dst_vector_access_dim,
dst_access_id[dst_vector_access_dim] * dst_data_per_access);
dst_vector_t dst_vector_data;
// pack vector from buffer and type conversion
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(dst_vector_access_dim,
i);
constexpr index_t buffer_offset = // SrcData to DstData conversion
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); DstData p_dst_long_vector[long_vector_size];
// SrcData to DstData type conversion is done here for(index_t i = 0; i < long_vector_size; ++i)
reinterpret_cast<DstData*>(&dst_vector_data)[i] = {
type_convert<DstData>{}(p_src_buffer[buffer_offset]); p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}); }
const index_t dst_offset = // store data from the long-vector buffer to dst
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id); for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
// store vector into dst const index_t buffer_offset = i * dst_data_per_access;
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = dst_vector_data;
});
#else
ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
auto dst_data_begin_id = dst_access_id;
dst_data_begin_id(dst_vector_access_dim) =
dst_access_id[dst_vector_access_dim] * dst_data_per_access;
dst_vector_t dst_vector_data; const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);
// pack vector from buffer and type conversion // Check dst vector's padding situation, only check the first data in this dst
for(index_t i = 0; i < DstDataPerAccess; ++i) // vector. It's user's responsibility to make sure all data in the dst vector
// has the same padding situation
if(dst_coord.IsUpperIndexMappedToValidOffset())
{ {
auto scalar_id = make_zero_array<index_t, nDim>(); static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
scalar_id(dst_vector_access_dim) = i; #if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>(
const index_t buffer_offset = *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); fwd(p_dst),
dst_coord.GetOffset(),
// SrcData to DstData type conversion is done here 0);
reinterpret_cast<DstData*>(&dst_vector_data)[i] = #else
type_convert<DstData>{}(p_src_buffer[buffer_offset]); *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
} *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = dst_vector_data;
});
#endif #endif
} }).Else([&](auto) {
} // dst can be all kinds of memory-space
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
private: *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
Array<index_t, nDim> mSrcSliceOrigin; });
Array<index_t, nDim> mDstSliceOrigin; }
}; }
// This threadwise copy allows vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is register, and
// the other is device memory or LDS
template <class SrcDesc,
class DstDesc,
class SliceLengths,
class DimAccessOrder,
index_t VectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r2
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2(
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == DimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
static_assert(
SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}); });
} }
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2() template <typename SrcData, typename DstData>
: ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(), __device__ void Run(const SrcData* p_src, DstData* p_dst) const
make_zero_array<index_t, nDim>())
{ {
} constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin) Run(p_src, p_dst, generic_address_space, generic_address_space);
{
mSrcSliceOrigin = src_slice_origin;
} }
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin) // Modify Length to 1, if Mask is set to false
// Used for isolating linear dimension from non-linear dimensions
template <index_t... Lengths, index_t... Mask>
__device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
{ {
mDstSliceOrigin = dst_slice_origin; return Sequence<(Mask ? Lengths : 1)...>{};
} }
template <class SrcData, class DstData> // p_src must be global-memory, p_dst can be any memory-space.
__device__ void Run(const SrcData* p_src, DstData* p_dst) const // User should make sure p_src is a block-invariant pointer, because
// buffer_load is used for loading from global-memory into register buffer.
// Will do padding check on src data: Read 0 if src data is in padding area.
// Will do padding check on dst data: No write if dst data is in padding area.
// This version is optimized for address calculation of src tensor
// TODO: this function is not compiled to expected ISA
template <typename SrcData,
typename DstData,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void
Run_optimized_src_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{ {
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType; using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType; using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
@@ -346,81 +228,107 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
constexpr auto long_vector_access_lengths = SliceLengths::Modify( constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 // separate linear dimensions from non-linear dimensions
static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&]( constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
auto long_vector_access_id) { constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
// data id w.r.t slicing-window static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify( long_vector_size == SrcDataPerAccess,
vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size); "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
"would drop");
// buffer to hold a long-vector // separate steps into linear and non-linear components, according to src tensor
SrcData p_src_long_vector[long_vector_size]; constexpr auto linear_long_vector_access_lengths =
DstData p_dst_long_vector[long_vector_size]; mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
// load data from src to the long-vector buffer constexpr auto nonlinear_long_vector_access_lengths =
static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) { mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
vector_access_dim, i * src_data_per_access);
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
constexpr index_t buffer_offset = i * src_data_per_access; // loop over src's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) = auto nonlinear_dim_long_vector_access_id) {
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
});
// type conversion // calculate step-sizes along src's nonlinear dimensions
for(index_t i = 0; i < long_vector_size; ++i) auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
{ nonlinear_dim_data_steps(vector_access_dim) =
p_dst_long_vector[i] = type_convert<DstType>{}(p_src_long_vector[i]); long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
}
// store data from the long-vector buffer to dst // move src coordinate along nonlinear dimensions
static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) { // this coordinate contains run-time per-thread offset
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify( const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
vector_access_dim, i * dst_data_per_access);
constexpr index_t buffer_offset = i * dst_data_per_access; // loop over src's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( // step-sizes along src's linear dimensions
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
});
#else
ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
[&](auto long_vector_access_id) {
// data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector // buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size]; SrcData p_src_long_vector[long_vector_size];
DstData p_dst_long_vector[long_vector_size];
// load data from src to the long-vector buffer // zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
// Loop over VectorAccessDim, and load data from src to the
// long-vector buffer.
// If VectorAccessDim is src's linear dimension, then src's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is src's nonlinear dimension, then src's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim should be src's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{ {
auto scalar_id = make_zero_array<index_t, nDim>(); auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access; scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
const index_t buffer_offset = i * src_data_per_access; const index_t buffer_offset = i * src_data_per_access;
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) = // move src coordinate along linear dimensions
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]); const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is src compile-time offset
const index_t src_linear_offset =
src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// this is src compile-time offset
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
#endif
// Check src vector's padding situation, only check the first data in
// this src vector. It's user's responsibility to make sure all data in
// the src vector has the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset())
{
static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
#else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
#endif
}).Else([&](auto) {
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
});
}
} }
// type conversion // SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
for(index_t i = 0; i < long_vector_size; ++i) for(index_t i = 0; i < long_vector_size; ++i)
{ {
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]); p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
@@ -434,371 +342,222 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
const index_t buffer_offset = i * dst_data_per_access; const index_t buffer_offset = i * dst_data_per_access;
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( // dst offset is calculated here, without explicitly separating into
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); // compile-time and per-thread component
const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = // Check dst vector's padding situation, only check the first data in
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]); // this dst vector. It's user's responsibility to make sure all data in
// the dst vector has the same padding situation
if(dst_coord.IsUpperIndexMappedToValidOffset())
{
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
}
} }
}); });
#endif });
}
private:
Array<index_t, nDim> mSrcSliceOrigin;
Array<index_t, nDim> mDstSliceOrigin;
};
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows order of access to be different on src and dst.
// It uses register as buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, and src and dst are
// device memory or LDS.
// When copying a large amount of data, let's hope the compiler will reduce the
// registers used for the buffer.
template <class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v2r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
DstCoordinate dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
: ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
} }
template <class TDesc, class Lengths> // p_src could be any memory space, p_dst must be global memory.
struct IsolateMergedDimLengths // User should make sure p_dst is a block-invariant pointer, because
// buffer_load is used for storing data from register buffer into global-memory.
// Will do padding check on src data: Read 0 if src data is in padding area.
// Will do padding check on dst data: No write if dst data is in padding area.
// This version is optimized for address calculation of dst tensor
// TODO: this function is not compiled to expected ISA
template <typename SrcData,
typename DstData,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void
Run_optimized_dst_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{ {
template <class IDim> using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
__device__ constexpr index_t operator()(IDim idim) const using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
{
return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
}
};
template <class SrcTData, class DstTData>
__device__ void Run(const SrcTData* p_src, DstTData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
SrcTData p_buffer_[buffer_desc.GetElementSpace()];
SrcTData* p_buffer = p_buffer_;
// copy data from src into buffer
{
using src_vector_t = typename vector_type<SrcTData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
// Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t
// normal dimensions is known at compile time.
// Below is a hack to isolate merged dimension id from normal dimension id, so the
// corresponding offset can be calculated separately at run-time and compile-time.
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// merged dimensions, and has value = 1 on normal dimensions;
// src_normal_dim_access_lengths has the same value as src_access_lengths on src's
// normal dimensions, and has value = 1 on merged dimensions;
constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
constexpr auto src_normal_dim_access_lengths =
src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offset w.r.t. merged dimension need to be computed at run-time
static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_merged_dim_access_id_) {
constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};
constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
src_vector_access_dim,
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
const SrcTData* p_src_tmp =
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_normal_dim_access_id_) {
constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){};
constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
src_vector_access_dim,
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access);
constexpr index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
// load vector from src
const src_vector_t vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
// unpack vector into buffer
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
src_vector_access_dim, i);
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
p_buffer[buffer_offset] =
reinterpret_cast<const SrcTData*>(&vector_data)[i];
});
});
});
#else
ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
[&](auto src_merged_dim_access_id) {
auto src_merged_dim_data_id = src_merged_dim_access_id;
src_merged_dim_data_id(src_vector_access_dim) =
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
const SrcTData* p_src_tmp = constexpr auto vector_access_dim = Number<VectorAccessDim>{};
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
// these should be compile-time known constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&]( constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
auto src_normal_dim_access_id) {
auto src_normal_dim_data_id = src_normal_dim_access_id; constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
src_normal_dim_data_id(src_vector_access_dim) =
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
const index_t src_normal_offset = constexpr auto long_vector_access_lengths = SliceLengths::Modify(
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
// load vector from src // separate linear dimensions from non-linear dimensions
const src_vector_t vector_data = constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]); constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
// unpack vector into buffer static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
for(index_t i = 0; i < SrcDataPerAccess; ++i) long_vector_size == DstDataPerAccess,
{ "Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
auto scalar_id = make_zero_array<index_t, nDim>(); "would drop");
scalar_id(src_vector_access_dim) = i;
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( // separate steps into linear and non-linear components, according to dst tensor
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);
p_buffer[buffer_offset] = constexpr auto nonlinear_long_vector_access_lengths =
reinterpret_cast<const SrcTData*>(&vector_data)[i]; mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);
}
});
});
#endif
}
// copy data from buffer into dst // loop over dst's non-linear dimensions
{ ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
using dst_vector_t = typename vector_type<DstTData, DstDataPerAccess>::MemoryType; auto nonlinear_dim_long_vector_access_id) {
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{}; // calculate step-sizes along dst's nonlinear dimensions
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{}; auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
constexpr auto dst_access_lengths = SliceLengths::Modify( // move dst coordinate along nonlinear dimensions
dst_vector_access_dim, // this coordinate contains run-time per-thread offset
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;
constexpr auto dst_merged_dim_access_lengths = typename sequence_gen< // loop over dst's linear dimensions
nDim, ford<decltype(linear_long_vector_access_lengths)>{}([&](
IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{}; auto linear_dim_long_vector_access_id) {
constexpr auto dst_normal_dim_access_lengths = // step-sizes along dst's linear dimensions
dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths; auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 // buffer to hold a long-vector
// offset w.r.t. merged dimension need to be computed at run-time SrcData p_src_long_vector[long_vector_size];
static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_merged_dim_access_id_) {
constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){}; // zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify( // Loop over VectorAccessDim, and load data from src to the
dst_vector_access_dim, // long-vector buffer.
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access); // If VectorAccessDim is dst's linear dimension, then dst's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is dst's nonlinear dimension, then dst's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim should be dst's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
DstTData* p_dst_tmp = const index_t buffer_offset = i * src_data_per_access;
p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time // src offset is calculated here, without explicitly separating into
static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&]( // compile-time and per-thread component
auto dst_normal_dim_access_id_) { const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps +
constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){}; linear_dim_data_steps + scalar_id);
constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify( // Check src vector's padding situation, only check the first data in
dst_vector_access_dim, // this src vector. It's user's responsibility to make sure all data in
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access); // the src vector has the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset())
{
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
}
}
dst_vector_t vector_data{}; // SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
// pack vector from buffer for(index_t i = 0; i < long_vector_size; ++i)
static_for<0, DstDataPerAccess, 1>{}([&](auto i) { {
constexpr auto scalar_id = p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
typename uniform_sequence_gen<nDim, 0>::type{}.Modify( }
dst_vector_access_dim, i);
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( // store data from the long-vector buffer to dst
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
reinterpret_cast<DstTData*>(&vector_data)[i] = const index_t buffer_offset = i * dst_data_per_access;
type_convert<DstTData>{}(p_buffer[buffer_offset]);
});
constexpr index_t dst_normal_offset = // move dst coordinate along linear dimensions
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
// write vector into dst #if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data; // this is dst compile-time offset
}); const index_t dst_linear_offset =
}); dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else #else
// offset w.r.t. merged dimension need to be computed at run-time // this is dst compile-time offset
ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&]( const index_t dst_linear_offset =
auto dst_merged_dim_access_id) { dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
#endif
auto dst_merged_dim_data_id = dst_merged_dim_access_id;
dst_merged_dim_data_id(dst_vector_access_dim) =
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
DstTData* p_dst_tmp =
p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id) {
auto dst_normal_dim_data_id = dst_normal_dim_access_id;
dst_normal_dim_data_id(dst_vector_access_dim) =
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
dst_vector_t vector_data{};
// pack vector from buffer // Check dst vector's padding situation, only check the first data in
for(index_t i = 0; i < DstDataPerAccess; ++i) // this dst vector. It's user's responsiblity to make sure all data in
// the dst vector has the same padding situation
if(dst_coord.IsUpperIndexMappedToValidOffset())
{ {
auto scalar_id = make_zero_array<index_t, nDim>(); static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
scalar_id(dst_vector_access_dim) = i; #if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>(
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); p_dst,
dst_nonlinear_coord.GetOffset(),
reinterpret_cast<DstTData*>(&vector_data)[i] = dst_linear_offset);
type_convert<DstTData>{}(p_buffer[buffer_offset]); #else
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
#endif
}).Else([&](auto) {
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
} }
}
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// write vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
});
}); });
});
}
__device__ static constexpr bool HasWorkingOptimizedAddressCalculation()
{
#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking
return true;
#else
return false;
#endif #endif
}
} }
// T can be Sequence or Array template <typename T, bool PositiveDirection>
template <class T, bool PositiveDirection> __device__ void MoveSrcSliceWindow(const T& step_sizes_,
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>) integral_constant<bool, PositiveDirection>)
{ {
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) { static_if<PositiveDirection>{}([&](auto) {
mSrcSliceOrigin += step_sizes; mSrcSliceOrigin += to_array(step_sizes);
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
} }
template <class T, bool PositiveDirection> template <typename T, bool PositiveDirection>
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>) __device__ void MoveDstSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{ {
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) { static_if<PositiveDirection>{}([&](auto) {
mDstSliceOrigin += step_sizes; mDstSliceOrigin += step_sizes;
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); }).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
} }
private: private:
SrcCoordinate mSrcSliceOrigin; SrcCoord mSrcSliceOrigin;
DstCoordinate mDstSliceOrigin; DstCoord mDstSliceOrigin;
}; };
} // namespace ck } // namespace ck
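For readers skimming this diff: the new v4r2 Run() above boils down to a staging pattern. Each iteration copies a "long vector" of lcm(SrcDataPerAccess, DstDataPerAccess) elements through a small register buffer, loading in src-sized chunks, converting element-wise with type_convert, and storing in dst-sized chunks; the optimized variants additionally split each chunk's offset into a run-time non-linear part and a compile-time linear part fed to buffer_load/buffer_store. The sketch below is a plain host-side illustration of the staging idea only, not the ck:: implementation; the function and parameter names are made up for the example, there is no padding check, and it assumes the element count is a multiple of the long-vector size.

// copy_with_long_vector: host-side sketch of the long-vector staging used by Run().
// Illustrative only; the names here are not part of the ck:: API.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

template <typename SrcData, typename DstData>
void copy_with_long_vector(const SrcData* p_src,
                           DstData* p_dst,
                           std::size_t n,
                           std::size_t src_data_per_access,
                           std::size_t dst_data_per_access)
{
    // one "long vector" covers both access granularities
    const std::size_t long_vector_size = std::lcm(src_data_per_access, dst_data_per_access);

    std::vector<SrcData> p_src_long_vector(long_vector_size);
    std::vector<DstData> p_dst_long_vector(long_vector_size);

    for(std::size_t begin = 0; begin < n; begin += long_vector_size)
    {
        // load from src in SrcDataPerAccess-sized chunks (a vector load / buffer_load on GPU)
        for(std::size_t i = 0; i < long_vector_size; i += src_data_per_access)
            std::copy_n(p_src + begin + i, src_data_per_access, p_src_long_vector.begin() + i);

        // per-element SrcData -> DstData conversion (type_convert in the kernel code)
        for(std::size_t i = 0; i < long_vector_size; ++i)
            p_dst_long_vector[i] = static_cast<DstData>(p_src_long_vector[i]);

        // store to dst in DstDataPerAccess-sized chunks (a vector store / buffer_store on GPU)
        for(std::size_t i = 0; i < long_vector_size; i += dst_data_per_access)
            std::copy_n(p_dst_long_vector.begin() + i, dst_data_per_access, p_dst + begin + i);
    }
}

int main()
{
    std::vector<float> src(8);
    std::vector<double> dst(8, 0.0);
    for(std::size_t i = 0; i < src.size(); ++i)
        src[i] = static_cast<float>(i);

    // e.g. SrcDataPerAccess = 4, DstDataPerAccess = 2 -> long_vector_size = 4
    copy_with_long_vector(src.data(), dst.data(), src.size(), 4, 2);

    for(double v : dst)
        std::cout << v << " "; // prints 0 1 2 3 4 5 6 7
    std::cout << std::endl;
    return 0;
}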
...
@@ -4,12 +4,9 @@
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
#include "hip/hip_fp16.h" #include "hip/hip_fp16.h"
#include "bfloat16_dev.hpp"
#define CK_DEVICE_BACKEND_AMD 1 #define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INLINE_ASM 1 #define CK_USE_AMD_INLINE_ASM 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
namespace ck { namespace ck {
...
@@ -24,6 +24,21 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
return os; return os;
} }
template <class Range>
std::ostream& LogRangeStrided(std::ostream& os, Range&& range, std::string delim, size_t stride)
{
bool first = true;
for(size_t idx = 0; idx < range.size(); idx += stride)
{
if(first)
first = false;
else
os << delim;
os << range[idx];
}
return os;
}
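As a quick sanity check, the LogRangeStrided helper added above can be exercised on its own. The snippet below is a hypothetical standalone usage, not part of this commit; it repeats the helper so the example compiles by itself, and it mirrors the commented-out strided dump of out_nkhw_device in the driver below.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// copy of the helper added above, repeated here so the example is self-contained
template <class Range>
std::ostream& LogRangeStrided(std::ostream& os, Range&& range, std::string delim, size_t stride)
{
    bool first = true;
    for(size_t idx = 0; idx < range.size(); idx += stride)
    {
        if(first)
            first = false;
        else
            os << delim;
        os << range[idx];
    }
    return os;
}

int main()
{
    std::vector<int> v{0, 1, 2, 3, 4, 5, 6, 7};
    // print every 4th element of the range
    LogRangeStrided(std::cout << "every 4th: ", v, ",", 4) << std::endl; // prints: every 4th: 0,4
    return 0;
}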
typedef enum { typedef enum {
Half = 0, Half = 0,
Float = 1, Float = 1,
...
@@ -1000,12 +1000,14 @@ int main(int argc, char* argv[])
} }
#if CONV_DIRECTION_FWD_DATA // fwd data #if CONV_DIRECTION_FWD_DATA // fwd data
check_error(out_nkhw_host, out_nkhw_device); check_error(out_nkhw_host, out_nkhw_device);
#elif CONV_DIRECTION_BWD_WEIT // bwd wrw //#elif CONV_DIRECTION_BWD_WEIT // bwd wrw
#elif CONV_DIRECTION_BWD_WEIT
check_error(wei_kcyx_host, wei_kcyx); check_error(wei_kcyx_host, wei_kcyx);
#endif #endif
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device : ", out_nkhw.mData, ",") << std::endl; LogRange(std::cout << "out_nkhw_device : ", out_nkhw.mData, ",") << std::endl;
//LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl; //LogRangeStrided(std::cout << "out_nkhw_device : ", out_nkhw.mData, ",", 64) << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
#if 0 #if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl; LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
...