"vscode:/vscode.git/clone" did not exist on "475b21b4f76924ef812f90911dece401bd20b502"
Commit b37cb71f authored by Wen-Heng (Jack) Chung

Enable bwd wrw

parent c5143bca
@@ -2,465 +2,57 @@
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace ck {
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
// This function assumes each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user needs to make sure
// that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in the constructor of BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;
// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
Array<index_t, nDim> dst_block_data_id_begin)
{
// check NDim consistency
static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong!");
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
// additional check for merged dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
// src
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_src =
SrcDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
// dst
static_if<DstDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_dst =
DstDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
});
// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_id_begin + thread_data_id_begin);
mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_id_begin + thread_data_id_begin);
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);
mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
__device__ static constexpr auto GetRegisterBufferDescriptor()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
}
__device__ static constexpr index_t GetRegisterBufferSize()
{
return GetRegisterBufferDescriptor().GetElementSpace();
}
template <class TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the multi-index
// of the SrcDesc (which might be a merged tensor) is all-zero. This threadwise slice copy
// assumes each thread is copying a normal (not merged) tensor.
// To satisfy this assumption, the user needs to make sure that, on a merged dimension
// that contains multiple original dimensions, the length of the last original
// dimension is evenly divisible by its sub-length. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in the constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
decltype(thread_buffer_desc),
SubLengths,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcDataPerAccess,
1>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
});
}
template <class TData>
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
TData* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the multi-index
// of the SrcDesc (which might be a merged tensor) is all-zero. This threadwise slice copy
// assumes each thread is copying a normal (not merged) tensor.
// To satisfy this assumption, the user needs to make sure that, on a merged dimension
// that contains multiple original dimensions, the length of the last original
// dimension is evenly divisible by its sub-length. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in the constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
DstDesc,
SubLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>(
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
});
}
template <class TData>
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
RunLoadRegisterBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst);
}
// When moving the slicing window along a merged dimension, if the strides of the
// original dimensions contained by the merged dimension are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in the positive direction (and vice versa for movement in the negative
// direction). As a result, there is the possibility that the offset calculation may
// result in unsigned integer underflow (due to the "-" operation). However, this hazard
// should not happen, as long as the user makes sure the slicing window is not moved out
// of the boundary of the tensor being sliced. This function doesn't do a runtime sanity
// check for an out-of-bound slicing window, for performance reasons.
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
// logic for a merged dimension; it also works for a non-merged dimension, but its logic
// may be unnecessarily complicated for the compiler to remove calculations that are
// useless for a non-merged dimension
// extract partial original dimensions
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
// calculate new partial original multi-id
auto old_src_partial_original_id =
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);
auto new_src_partial_original_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
constexpr auto IDimOriginal = src_partial_original_dims[I];
mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
});
// calculate new partial offset on this merged dimension
const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];
const index_t new_src_partial_offset =
src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto) {
// Logic for a non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, the compiler
// should be able to remove these calculations.
// TODO: make sure the compiler would actually remove them in this case.
// It is the user's responsibility to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be a hazard like
// unsigned integer underflow. There is NO runtime sanity check to prevent the hazard.
constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) += StepSize;
mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
}).Else([&](auto fwd) {
mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize;
mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
});
});
}
template <class T, bool PositiveDirection>
__device__ void
MoveSrcSlicingWindow(T step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
static_for<0, nDim, 1>{}([&](auto idim) {
if(step_sizes[idim] != 0)
{
MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
}
});
}
};
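// A standalone host-side sketch (hypothetical sizes, not this library's API) of how the
// blockwise copy above divides work: SliceLengths must factor into
// SubLengths * ThreadClusterLengths * repeat_lengths, and BlockSize must equal the product
// of ThreadClusterLengths. All names and numbers below are illustrative assumptions.
#include <array>
#include <cassert>
#include <cstdio>

int main()
{
    constexpr int nDim = 2;
    constexpr std::array<int, nDim> slice_lengths          = {8, 128}; // hypothetical slice
    constexpr std::array<int, nDim> sub_lengths            = {4, 1};   // per-thread tile
    constexpr std::array<int, nDim> thread_cluster_lengths = {2, 128}; // threads per dim

    int block_size = 1;
    for(int d = 0; d < nDim; ++d)
    {
        const int data_per_cluster = sub_lengths[d] * thread_cluster_lengths[d];
        assert(slice_lengths[d] % data_per_cluster == 0 &&
               "cannot evenly divide sliced tensor into cluster");
        const int repeat = slice_lengths[d] / data_per_cluster;
        std::printf("dim %d: repeat_length = %d\n", d, repeat);
        block_size *= thread_cluster_lengths[d];
    }
    std::printf("required BlockSize = %d\n", block_size); // 256 for these numbers
}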
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
typename BlockSrcDesc,
typename BlockDstDesc,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2
struct BlockwiseGenericTensorSliceCopy_v4
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
DstCoordinate dst_block_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize(),
static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
const Index& dst_block_slice_origin)
{
static_assert(nDim == BlockSrcDesc::GetNumOfDimension() &&
nDim == BlockDstDesc::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// map threads to cluster
constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! BlockSize not consistent with ThreadClusterLengths");
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
@@ -469,438 +61,166 @@ struct BlockwiseGenericTensorSliceCopy_v2
mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
}
__device__ static constexpr index_t GetRegisterBufferSize()
__device__ static constexpr index_t GetThreadBufferSize()
{
return RegisterBufferDesc::GetElementSpace();
return ThreadBufferDesc::GetElementSpace();
}
template <class TData>
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
template <typename BlockSrcData,
typename ThreadBufferData,
AddressSpace BlockSrcAddressSpace,
AddressSpace ThreadBufferAddressSpace>
__device__ void
RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer,
integral_constant<AddressSpace, BlockSrcAddressSpace>,
integral_constant<AddressSpace, ThreadBufferAddressSpace>) const
{
constexpr auto block_src_address_space =
integral_constant<AddressSpace, BlockSrcAddressSpace>{};
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseLoad.Run_optimized_src_address_calculation(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
else
{
mThreadwiseLoad.Run(
p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space);
}
}
template <typename BlockSrcData, typename ThreadBufferData>
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunLoadThreadBuffer(
p_block_src, p_thread_buffer, generic_address_space, generic_address_space);
}
template <typename ThreadBufferData,
typename BlockDstData,
AddressSpace ThreadBufferAddressSpace,
AddressSpace BlockDstAddressSpace>
__device__ void
RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, ThreadBufferAddressSpace>,
integral_constant<AddressSpace, BlockDstAddressSpace>) const
{
constexpr auto thread_buffer_address_space =
integral_constant<AddressSpace, ThreadBufferAddressSpace>{};
constexpr auto block_dst_address_space =
integral_constant<AddressSpace, BlockDstAddressSpace>{};
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseStore.Run_optimized_dst_address_calculation(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
else
{
mThreadwiseStore.Run(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
}
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const
{
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunStoreThreadBuffer(
p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);
}
template <typename BlockSrcData,
typename BlockDstData,
AddressSpace BlockSrcAddressSpace,
AddressSpace BlockDstAddressSpace>
__device__ void
Run(const BlockSrcData* p_block_src,
BlockDstData* p_block_dst,
integral_constant<AddressSpace, BlockSrcAddressSpace> block_src_address_space,
integral_constant<AddressSpace, BlockDstAddressSpace> block_dst_address_space) const
{
mThreadwiseLoad.Run(p_src, p_buffer);
}
BlockSrcData p_thread_buffer[GetThreadBufferSize()];
template <class TData>
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
{
mThreadwiseStore.Run(p_buffer, p_dst);
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
RunLoadThreadBuffer(
p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);
// if there is type conversion, it's done during store
RunStoreThreadBuffer(
p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space);
}
template <class TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
mThreadwiseLoad.Run(p_src, p_buffer);
mThreadwiseStore.Run(p_buffer, p_dst);
Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSlicingWindow(T step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
MoveSrcSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveDstSlicingWindow(T step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
MoveDstSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
}
private:
using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
using ThreadwiseLoad =
ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
RegisterBufferDesc,
SrcCoordinate,
NormalTensorCoordinate<RegisterBufferDesc>,
SubLengths,
SrcDimAccessOrder,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore =
ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
DstDesc,
NormalTensorCoordinate<RegisterBufferDesc>,
DstCoordinate,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
using ThreadBufferDesc = decltype(make_native_tensor_descriptor_packed(ThreadSliceLengths{}));
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<BlockSrcDesc,
ThreadBufferDesc,
ThreadSliceLengths,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore;
};
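// A simplified, single-threaded host sketch (not this library's API) of the load-then-store
// pipeline used by the blockwise copy above: data is first read into a small per-thread
// buffer, then written to the destination; any type conversion would happen on the store
// side. The buffer size, array sizes and function name below are hypothetical.
#include <cstdio>

template <int ThreadBufferSize>
void copy_through_thread_buffer(const float* p_block_src, float* p_block_dst, int total)
{
    for(int begin = 0; begin < total; begin += ThreadBufferSize)
    {
        float p_thread_buffer[ThreadBufferSize]; // stands in for the register buffer

        // analogous to RunLoadThreadBuffer: src -> thread buffer
        for(int i = 0; i < ThreadBufferSize; ++i)
            p_thread_buffer[i] = p_block_src[begin + i];

        // analogous to RunStoreThreadBuffer: thread buffer -> dst
        for(int i = 0; i < ThreadBufferSize; ++i)
            p_block_dst[begin + i] = p_thread_buffer[i];
    }
}

int main()
{
    float src[16], dst[16] = {};
    for(int i = 0; i < 16; ++i)
        src[i] = float(i);
    copy_through_thread_buffer<4>(src, dst, 16);
    std::printf("dst[15] = %f\n", dst[15]);
}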
// this will be deprecated
// slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst
// For now, only support SubLengths[...] == 1 on a merged dimension
template <index_t BlockSize,
class Float,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class DataClusterLengths,
class ThreadClusterArrangeOrder,
class SrcAccessOrder,
class DstAccessOrder,
index_t SrcDataPerRead,
index_t DstDataPerWrite>
struct BlockwiseGenericTensorSliceCopy_v1_deprecated
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;
// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1_deprecated(
Array<index_t, nDim> src_block_data_multi_id_begin,
Array<index_t, nDim> dst_block_data_multi_id_begin)
{
// check NDim consistency
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcAccessOrder>::value &&
is_valid_sequence_map<DstAccessOrder>::value,
"wrong!");
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! block size doesn't match with thread cluster size.");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into sub-tensor");
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
// for now, only support SubLengths == 1 on a merged dimension that contains
// multiple original dimensions
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
static_assert(SubLengths::Get(IDim) == 1 ||
(!SrcDesc::ContainMultipleOriginalDimensions(IDim) &&
!DstDesc::ContainMultipleOriginalDimensions(IDim)),
"wrong! only support Sub-Length == 1 on a merged dimension");
});
// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_multi_id =
reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_multi_id_begin + thread_data_multi_id_begin);
mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_multi_id_begin + thread_data_multi_id_begin);
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr index_t idim = IDim;
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
mThreadSrcPartialOffsets(idim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr index_t idim = IDim;
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);
mThreadDstPartialOffsets(idim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
__device__ static constexpr index_t GetRegisterClipboardSize()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto src_thread_data_multi_id_begin =
repeat_multi_id * data_per_cluster_per_dims;
constexpr auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#else
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const index_t src_offset =
SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#endif
threadwise_generic_tensor_slice_copy_v1_deprecated(SrcDesc{},
p_src + src_offset +
mThreadSrcOffset,
make_zero_array<index_t, nDim>(),
thread_tensor_desc,
p_clipboard + clipboard_offset,
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
SrcAccessOrder{},
Number<SrcDataPerRead>{});
});
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths;
constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
constexpr index_t dst_offset =
DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#else
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#endif
threadwise_generic_tensor_slice_copy_v1_deprecated(thread_tensor_desc,
p_clipboard + clipboard_offset,
make_zero_array<index_t, nDim>(),
DstDesc{},
p_dst + dst_offset +
mThreadDstOffset,
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
DstAccessOrder{},
Number<DstDataPerWrite>{});
});
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
Float p_clipboard[GetRegisterClipboardSize()];
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
}
// When moving the slicing window along a merged dimension, if the strides of the
// original dimensions contained by the merged dimension are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in the positive direction (and vice versa for movement in the negative
// direction). As a result, there is the possibility that the offset calculation may
// result in unsigned integer underflow (due to the "-" operation). However, this hazard
// should not happen, as long as the user makes sure the slicing window is not moved out
// of the boundary of the tensor being sliced. This function doesn't do a runtime sanity
// check for an out-of-bound slicing window, for performance reasons.
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
constexpr index_t idim = IDim;
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
// logic for a merged dimension; it also works for a non-merged dimension, but its logic
// may be unnecessarily complicated for the compiler to remove calculations that are
// useless for a non-merged dimension
// extract partial original dimensions
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
// calculate new partial original multi-id
auto old_src_partial_original_multi_id =
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);
auto new_src_partial_original_multi_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_multi_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I_) {
constexpr auto I = decltype(I_){};
constexpr index_t idim_original = src_partial_original_dims.Get(I);
mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I];
});
// calculate new partial offset on this merged dimension
const index_t old_src_partial_offset = mThreadSrcPartialOffsets[idim];
const index_t new_src_partial_offset =
src_partial_original_desc.GetOffsetFromMultiIndex(
new_src_partial_original_multi_id);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets(idim) = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto) {
// Logic for a non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, the compiler
// should be able to remove these calculations.
// TODO: make sure the compiler would actually remove them in this case.
// It is the user's responsibility to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be a hazard like
// unsigned integer underflow. There is NO runtime sanity check to prevent the hazard.
constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(idim_original) += StepSize;
mThreadSrcPartialOffsets(idim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
}).Else([&](auto fwd) {
mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(idim_original) -= StepSize;
mThreadSrcPartialOffsets(idim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
});
});
}
};
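// A tiny host sketch of the "+ before -" update used above when moving the slicing window
// on a merged dimension: with an unsigned offset type, (old_offset + new_partial) -
// old_partial stays in range even when new_partial < old_partial, whereas computing
// (new_partial - old_partial) first would underflow. All values are hypothetical.
#include <cstdint>
#include <cstdio>

int main()
{
    using index_t = uint32_t;

    index_t thread_src_offset = 100; // current total offset
    const index_t old_partial = 40;  // old partial offset on the merged dimension
    const index_t new_partial = 25;  // new partial offset (smaller than the old one)

    // wrong order: (new_partial - old_partial) underflows to a huge unsigned value
    // index_t bad = thread_src_offset + (new_partial - old_partial);

    // safe order used by MoveSlicingWindowOnSourceTensor: do "+" before "-"
    thread_src_offset = (thread_src_offset + new_partial) - old_partial;

    std::printf("updated offset = %u\n", thread_src_offset); // 85
}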
} // namespace ck
#endif
@@ -7,98 +7,160 @@
namespace ck {
template <class Float, class Matrix>
template <typename Float, class Matrix>
__device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
{
for(index_t i = 0; i < Matrix::NRow(); ++i)
{
for(index_t j = 0; j < Matrix::NCol(); ++j)
{
const index_t id = Matrix::GetOffsetFromMultiIndex(i, j);
const index_t id = Matrix::CalculateOffset(i, j);
p_thread[id] = Float(0);
}
}
}
template <class Float,
class SrcMatrix,
class DstMatrix,
index_t NRow,
index_t NCol,
index_t DataPerRead>
__device__ void threadwise_matrix_copy(SrcMatrix,
const Float* __restrict__ p_src,
DstMatrix,
Float* __restrict__ p_dst,
Sequence<NRow, NCol>,
Number<DataPerRead>)
template <typename SrcMatrix,
typename DstMatrix,
index_t NSliceRow,
index_t NSliceCol,
index_t DataPerAccess>
struct ThreadwiseMatrixSliceCopy
{
static_assert(NCol % DataPerRead == 0, "wrong! should be NCol % DataPerRead == 0");
__device__ constexpr ThreadwiseMatrixSliceCopy()
{
static_assert(SrcMatrix::RowStride() % DataPerAccess == 0 &&
DstMatrix::RowStride() % DataPerAccess == 0,
"wrong! wrong alignment");
static_assert(NSliceCol % DataPerAccess == 0,
"wrong! should be NSliceCol % DataPerAccess == 0");
}
constexpr auto src_mtx = SrcMatrix{};
constexpr auto dst_mtx = DstMatrix{};
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
for(index_t i = 0; i < NRow; ++i)
template <typename Data>
__device__ static void Run(const Data* p_src, Data* p_dst)
{
for(index_t j = 0; j < NCol; j += DataPerRead)
using vector_t = typename vector_type<Data, DataPerAccess>::MemoryType;
for(index_t i = 0; i < NSliceRow; ++i)
{
const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j);
const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j);
for(index_t j = 0; j < NSliceCol; j += DataPerAccess)
{
const index_t src_index = SrcMatrix::CalculateOffset(i, j);
const index_t dst_index = DstMatrix::CalculateOffset(i, j);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
}
}
}
}
};
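// A standalone host sketch (assumed row-major layout and a hard-coded 4-wide vector type,
// not this library's descriptors) of the vectorized inner loop above: each row of an
// NSliceRow x NSliceCol tile is copied DataPerAccess elements at a time through a wider
// vector type. The float4_t struct and copy_tile function below are hypothetical.
#include <cstdio>

struct float4_t { float x[4]; }; // stands in for vector_type<float, 4>::MemoryType

template <int NSliceRow, int NSliceCol, int DataPerAccess, int RowStride>
void copy_tile(const float* p_src, float* p_dst)
{
    static_assert(NSliceCol % DataPerAccess == 0, "NSliceCol % DataPerAccess == 0");
    static_assert(DataPerAccess == 4, "this sketch hard-codes a 4-wide vector");
    using vector_t = float4_t;

    for(int i = 0; i < NSliceRow; ++i)
        for(int j = 0; j < NSliceCol; j += DataPerAccess)
        {
            const int src_index = i * RowStride + j; // plays the role of CalculateOffset(i, j)
            const int dst_index = i * RowStride + j;
            *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
                *reinterpret_cast<const vector_t*>(&p_src[src_index]);
        }
}

int main()
{
    float src[2 * 8], dst[2 * 8] = {};
    for(int i = 0; i < 16; ++i)
        src[i] = float(i);
    copy_tile<2, 8, 4, 8>(src, dst); // 2x8 tile, row stride 8
    std::printf("dst[12] = %f\n", dst[12]);
}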
template <class MatrixA,
class MatrixB,
class MatrixC,
bool TransA,
bool TransB,
bool TransC,
class FloatA,
class FloatB,
class FloatC>
__device__ void threadwise_gemm(MatrixA,
integral_constant<bool, TransA>,
const FloatA* __restrict__ p_a_thread,
MatrixB,
integral_constant<bool, TransB>,
const FloatB* __restrict__ p_b_thread,
MatrixC,
integral_constant<bool, TransC>,
FloatC* __restrict__ p_c_thread)
// C += transpose(A) * B
// Elements of the matrices can be vectorized data
template <typename MatrixA, typename MatrixB, typename MatrixC>
struct ThreadwiseGemmTransANormalBNormalC
{
static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) {
constexpr auto a_mtx = MatrixA{};
constexpr auto b_mtx = MatrixB{};
constexpr auto c_mtx = MatrixC{};
__device__ constexpr ThreadwiseGemmTransANormalBNormalC()
{
static_assert(MatrixA::NRow() == MatrixB::NRow() && MatrixA::NCol() == MatrixC::NRow() &&
MatrixB::NCol() == MatrixC::NCol(),
"wrong!");
}
constexpr index_t M = c_mtx.NRow();
constexpr index_t N = c_mtx.NCol();
constexpr index_t K = a_mtx.NRow(); // A is transposed
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
for(index_t k = 0; k < K; ++k)
{
for(index_t i = 0; i < M; ++i)
for(index_t m = 0; m < M; ++m)
{
for(index_t j = 0; j < N; ++j)
for(index_t n = 0; n < N; ++n)
{
const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed
const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j);
const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j);
const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
const index_t bindex = MatrixB::CalculateOffset(k, n);
const index_t cindex = MatrixC::CalculateOffset(m, n);
p_c_thread[cindex] += math::inner_product_with_conversion<FloatC>{}(
p_a_thread[aindex], p_b_thread[bindex]);
p_c[cindex] +=
inner_product_with_conversion<FloatC>{}(p_a[aindex], p_b[bindex]);
}
}
}
}).Else([&](auto fwd) {
// not implemented
static_assert(fwd(false), "wrong! support for this config is not implemented");
});
}
}
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr index_t M = MatrixC::NRow();
constexpr index_t N = MatrixC::NCol();
constexpr index_t K = MatrixA::NRow(); // A is transposed
static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet");
for(index_t k = 0; k < K; ++k)
{
for(index_t m = 0; m < M; ++m)
{
const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed
static_if<N == 2>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
__outer_product_1x2(
p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]);
});
static_if<N == 4>{}([&](auto) {
const index_t bindex_0 = MatrixB::CalculateOffset(k, 0);
const index_t bindex_1 = MatrixB::CalculateOffset(k, 1);
const index_t bindex_2 = MatrixB::CalculateOffset(k, 2);
const index_t bindex_3 = MatrixB::CalculateOffset(k, 3);
const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
const index_t cindex_2 = MatrixC::CalculateOffset(m, 2);
const index_t cindex_3 = MatrixC::CalculateOffset(m, 3);
__outer_product_1x4(p_a[aindex],
p_b[bindex_0],
p_b[bindex_1],
p_b[bindex_2],
p_b[bindex_3],
p_c[cindex_0],
p_c[cindex_1],
p_c[cindex_2],
p_c[cindex_3]);
});
}
}
}
#endif
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
constexpr bool has_amd_asm = is_same<FloatC, float>{} &&
((is_same<FloatA, float>{} && is_same<FloatB, float>{}) ||
(is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
(is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));
static_if<has_amd_asm>{}([&](auto fwd) {
Run_amd_asm(p_a, p_b, fwd(p_c));
}).Else([&](auto) { Run_source(p_a, p_b, p_c); });
#else
Run_source(p_a, p_b, p_c);
#endif
}
};
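// A host-side sketch of the Run_source loop above: C (M x N) += transpose(A) * B, where A
// is stored K x M and B is stored K x N, both with unit column stride. The plain-array
// layout, gemm_tA name and the sample data below are illustrative assumptions.
#include <cstdio>

template <int M, int N, int K>
void gemm_tA(const float* a /*K x M*/, const float* b /*K x N*/, float* c /*M x N*/)
{
    for(int k = 0; k < K; ++k)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
                c[m * N + n] += a[k * M + m] * b[k * N + n]; // A is transposed
}

int main()
{
    constexpr int M = 2, N = 2, K = 3;
    const float a[K * M] = {1, 2, 3, 4, 5, 6};
    const float b[K * N] = {1, 0, 0, 1, 1, 1};
    float c[M * N]       = {};
    gemm_tA<M, N, K>(a, b, c);
    std::printf("c = [%g %g; %g %g]\n", c[0], c[1], c[2], c[3]);
}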
} // namespace ck
#endif
@@ -2,336 +2,218 @@
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
#include "float_types.h"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
namespace ck {
// This version uses multi-index transformation
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, and src and dst are
// device memory or LDS.
// When copying a large amount of data, let's hope the compiler will reduce the
// registers used for the buffer.
template <class SrcDesc,
class DstDesc,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
// It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register, and
// the other is device memory or LDS
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r1
struct ThreadwiseGenericTensorSliceCopy_v4r2
{
static constexpr index_t nDim = SliceLengths::GetSize();
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
using SrcCoord = typename TensorCoordinate<SrcDesc>::type;
using DstCoord = typename TensorCoordinate<DstDesc>::type;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin,
const Index& dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
nDim == DimAccessOrder::Size(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_assert(is_valid_sequence_map<DimAccessOrder>{}, "wrong! map is not valid");
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_assert(
SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
"wrong! cannot evenly divide");
// TODO: sanity-check if vectorized memory access is allowed on src and dst
}
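// A small standalone sketch (hypothetical values, placed here only as an illustration) of
// the long-vector sizing used by this copy: the per-access buffer is
// lcm(SrcDataPerAccess, DstDataPerAccess) elements long, so it can be filled with whole
// src vectors and drained with whole dst vectors. The gcd_/lcm_ helpers are assumptions.
#include <cstdio>

constexpr int gcd_(int a, int b) { return b == 0 ? a : gcd_(b, a % b); }
constexpr int lcm_(int a, int b) { return a / gcd_(a, b) * b; }

int main()
{
    constexpr int SrcDataPerAccess = 4; // e.g. 4-wide loads
    constexpr int DstDataPerAccess = 2; // e.g. 2-wide stores
    constexpr int long_vector_size = lcm_(SrcDataPerAccess, DstDataPerAccess); // 4

    std::printf("long_vector_size = %d: %d src loads, %d dst stores per long vector\n",
                long_vector_size,
                long_vector_size / SrcDataPerAccess,
                long_vector_size / DstDataPerAccess);
}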
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
: ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
: ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
__device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
__device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
template <class SrcData, class DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
// Will do padding check on src data: Read 0 if src data is in the padding area.
// Will do padding check on dst data: No write if dst data is in the padding area.
template <typename SrcData,
typename DstData,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void Run(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
SrcData p_src_buffer_[buffer_desc.GetElementSpace()];
SrcData* p_src_buffer = p_src_buffer_;
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
// copy data from src into src buffer
{
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
constexpr auto src_data_begin_id = src_access_id.Modify(
src_vector_access_dim,
src_access_id[src_vector_access_dim] * src_data_per_access);
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
auto long_vector_access_id) {
// load vector from src
const src_vector_t src_vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
// data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// unpack vector into buffer
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(src_vector_access_dim,
i);
// buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size];
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
p_src_buffer[buffer_offset] =
reinterpret_cast<const SrcData*>(&src_vector_data)[i];
});
});
#else
ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
auto src_data_begin_id = src_access_id;
src_data_begin_id(src_vector_access_dim) =
src_access_id[src_vector_access_dim] * src_data_per_access;
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
const index_t buffer_offset = i * src_data_per_access;
// load vector from src
const src_vector_t src_vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
// Check the src vector's padding situation; only check the first data in this src
// vector. It's the user's responsibility to make sure all data in the src vector
// has the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset())
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
p_src_buffer[buffer_offset] =
reinterpret_cast<const SrcData*>(&src_vector_data)[i];
}
});
static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
fwd(p_src), src_coord.GetOffset(), 0);
#else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
#endif
}
// copy data from buffer to dst
{
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
constexpr auto dst_data_begin_id = dst_access_id.Modify(
dst_vector_access_dim,
dst_access_id[dst_vector_access_dim] * dst_data_per_access);
dst_vector_t dst_vector_data;
// pack vector from buffer and type conversion
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(dst_vector_access_dim,
i);
}).Else([&](auto) {
// src can be all kinds of memory-space.
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
});
}
}
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
// SrcData to DstData type conversion is done here
reinterpret_cast<DstData*>(&dst_vector_data)[i] =
type_convert<DstData>{}(p_src_buffer[buffer_offset]);
});
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
// store vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = dst_vector_data;
});
#else
ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
auto dst_data_begin_id = dst_access_id;
dst_data_begin_id(dst_vector_access_dim) =
dst_access_id[dst_vector_access_dim] * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
dst_vector_t dst_vector_data;
const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);
// pack vector from buffer and type conversion
for(index_t i = 0; i < DstDataPerAccess; ++i)
// Check the dst vector's padding situation; only the first element of this dst
// vector is checked. It is the user's responsibility to make sure all data in
// the dst vector has the same padding situation.
if(dst_coord.IsUpperIndexMappedToValidOffset())
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(dst_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
// SrcData to DstData type conversion is done here
reinterpret_cast<DstData*>(&dst_vector_data)[i] =
type_convert<DstData>{}(p_src_buffer[buffer_offset]);
}
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = dst_vector_data;
});
static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
fwd(p_dst),
dst_coord.GetOffset(),
0);
#else
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
#endif
}
}
private:
Array<index_t, nDim> mSrcSliceOrigin;
Array<index_t, nDim> mDstSliceOrigin;
};
// This threadwise copy allows vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimension of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register buffer, and
// the other is device memory or LDS.
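// Hedged usage sketch (descriptor types and numeric parameters below are
// assumptions for illustration, not taken from this commit):
//   using Copy = ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
//                                                      DstDesc,
//                                                      Sequence<1, 4>, // SliceLengths
//                                                      Sequence<0, 1>, // DimAccessOrder
//                                                      1,              // VectorAccessDim
//                                                      4,              // SrcDataPerAccess
//                                                      4>;             // DstDataPerAccess
//   Copy copy(make_zero_array<index_t, 2>(), make_zero_array<index_t, 2>());
//   copy.Run(p_src, p_dst); // p_src, p_dst: raw pointers to src/dst data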
template <class SrcDesc,
class DstDesc,
class SliceLengths,
class DimAccessOrder,
index_t VectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r2
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2(
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == DimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
static_assert(
SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto) {
// dst can be all kinds of memory-space
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
}
}
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2()
: ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
template <typename SrcData, typename DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
{
}
constexpr auto generic_address_space =
integral_constant<AddressSpace, AddressSpace::generic>{};
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
Run(p_src, p_dst, generic_address_space, generic_address_space);
}
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
// Modify a length to 1 if the corresponding Mask entry is false.
// Used for isolating linear dimensions from non-linear dimensions.
template <index_t... Lengths, index_t... Mask>
__device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
{
mDstSliceOrigin = dst_slice_origin;
return Sequence<(Mask ? Lengths : 1)...>{};
}
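// For example (values assumed for illustration):
//   mask_lengths(Sequence<4, 8, 2>{}, Sequence<1, 0, 1>{}) returns
//   Sequence<4, 1, 2>{}, i.e. the length of every masked-out dimension
//   collapses to 1.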
template <class SrcData, class DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
// p_src must be global memory; p_dst can be any memory space.
// The user should make sure p_src is a block-invariant pointer, because
// buffer_load is used for loading from global memory into the register buffer.
// Padding check on src data: read 0 if the src data is in the padding area.
// Padding check on dst data: no write if the dst data is in the padding area.
// This version is optimized for the address calculation of the src tensor.
// TODO: this function does not compile to the expected ISA.
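// Rough sketch of the optimization (not a statement about the generated ISA):
//   src_offset = src_nonlinear_coord.GetOffset()  // run-time, per thread
//              + src_linear_offset                // compile-time offset-diff
// so the loops over src's linear dimensions only add constant offsets to a
// base address computed once per non-linear step.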
template <typename SrcData,
typename DstData,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void
Run_optimized_src_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
......@@ -346,81 +228,107 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
auto long_vector_access_id) {
// separate linear dimensions from non-linear dimensions
constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
// data id w.r.t slicing-window
constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify(
vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size);
static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
long_vector_size == SrcDataPerAccess,
"Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
"would drop");
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
DstData p_dst_long_vector[long_vector_size];
// separate steps into linear and non-linear components, according to the src tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
// load data from src to the long-vector buffer
static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
vector_access_dim, i * src_data_per_access);
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
constexpr index_t buffer_offset = i * src_data_per_access;
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
});
// loop over src's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
// type conversion
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
// calculate step-sizes along src's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// store data from the long-vector buffer to dst
static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
vector_access_dim, i * dst_data_per_access);
// move src coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
constexpr index_t buffer_offset = i * dst_data_per_access;
// loop over src's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
});
#else
ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
[&](auto long_vector_access_id) {
// data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// step-sizes along src's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
DstData p_dst_long_vector[long_vector_size];
// load data from src to the long-vector buffer
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
// Loop over VectorAccessDim, and load data from src to the
// long-vector buffer.
// If VectorAccessDim is src's linear dimension, then src's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is src's nonlinear dimension, then src's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim should be src's linear dimension.
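// For example (strides assumed for illustration): if VectorAccessDim is a
// linear dimension with stride 1, the offset-diff contributed by iteration i
// is simply i * src_data_per_access, and the whole linear part folds into the
// compile-time src_linear_offset used below; on a non-linear dimension the
// offset must be recomputed from the coordinate at run time.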
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
const index_t buffer_offset = i * src_data_per_access;
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
// move src coordinate along linear dimensions
const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is src compile-time offset
const index_t src_linear_offset =
src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// this is src compile-time offset
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
#endif
// Check the src vector's padding situation; only the first element of
// this src vector is checked. It is the user's responsibility to make sure
// all data in the src vector has the same padding situation.
if(src_coord.IsUpperIndexMappedToValidOffset())
{
static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
#else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
#endif
}).Else([&](auto) {
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
});
}
}
// type conversion
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
......@@ -434,371 +342,222 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
const index_t buffer_offset = i * dst_data_per_access;
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
// dst offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
// Check the dst vector's padding situation; only the first element of
// this dst vector is checked. It is the user's responsibility to make sure
// all data in the dst vector has the same padding situation.
if(dst_coord.IsUpperIndexMappedToValidOffset())
{
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
}
}
});
#endif
}
private:
Array<index_t, nDim> mSrcSliceOrigin;
Array<index_t, nDim> mDstSliceOrigin;
};
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, where src and dst are
// device memory or LDS.
// When copying a large amount of data, we rely on the compiler to reduce the
// registers used for the buffer.
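// Hedged usage sketch (coordinate/descriptor types and numeric parameters are
// assumptions for illustration, not taken from this commit):
//   using Copy = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
//                                                      DstDesc,
//                                                      SrcCoordinate,
//                                                      DstCoordinate,
//                                                      Sequence<2, 4>, // SliceLengths
//                                                      Sequence<0, 1>, // SrcDimAccessOrder
//                                                      Sequence<1, 0>, // DstDimAccessOrder
//                                                      1,              // SrcVectorAccessDim
//                                                      0,              // DstVectorAccessDim
//                                                      4,              // SrcDataPerAccess
//                                                      1>;             // DstDataPerAccess
//   Copy copy(src_slice_origin_coord, dst_slice_origin_coord);
//   copy.Run(p_src, p_dst);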
template <class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v2r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
DstCoordinate dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
: ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
});
}
template <class TDesc, class Lengths>
struct IsolateMergedDimLengths
// p_src can be any memory space; p_dst must be global memory.
// The user should make sure p_dst is a block-invariant pointer, because
// buffer_store is used for storing data from the register buffer into global memory.
// Padding check on src data: read 0 if the src data is in the padding area.
// Padding check on dst data: no write if the dst data is in the padding area.
// This version is optimized for the address calculation of the dst tensor.
// TODO: this function does not compile to the expected ISA.
template <typename SrcData,
typename DstData,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void
Run_optimized_dst_address_calculation(const SrcData* p_src,
DstData* p_dst,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>) const
{
template <class IDim>
__device__ constexpr index_t operator()(IDim idim) const
{
return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
}
};
template <class SrcTData, class DstTData>
__device__ void Run(const SrcTData* p_src, DstTData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
SrcTData p_buffer_[buffer_desc.GetElementSpace()];
SrcTData* p_buffer = p_buffer_;
// copy data from src into buffer
{
using src_vector_t = typename vector_type<SrcTData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
// Offsets w.r.t. merged dimensions need to be calculated at run-time. Offsets w.r.t.
// normal dimensions are known at compile time.
// Below is a hack to isolate merged dimension ids from normal dimension ids, so the
// corresponding offsets can be calculated separately at run-time and compile-time.
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// merged dimensions, and has value = 1 on normal dimensions;
// src_normal_dim_access_lengths has the same value as src_access_lengths on src's
// normal dimensions, and has value = 1 on merged dimensions.
constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
constexpr auto src_normal_dim_access_lengths =
src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
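// Worked example (lengths and dimension kinds assumed for illustration):
//   if src_access_lengths = Sequence<4, 8, 2> and only dimension 1 is a merged
//   dimension, then src_merged_dim_access_lengths = Sequence<1, 8, 1> and
//   src_normal_dim_access_lengths = Sequence<4, 1, 2>; their element-wise
//   product recovers src_access_lengths.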
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offset w.r.t. merged dimension need to be computed at run-time
static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_merged_dim_access_id_) {
constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};
constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
src_vector_access_dim,
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
const SrcTData* p_src_tmp =
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_normal_dim_access_id_) {
constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){};
constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
src_vector_access_dim,
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access);
constexpr index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
// load vector from src
const src_vector_t vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
// unpack vector into buffer
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
src_vector_access_dim, i);
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
p_buffer[buffer_offset] =
reinterpret_cast<const SrcTData*>(&vector_data)[i];
});
});
});
#else
ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
[&](auto src_merged_dim_access_id) {
auto src_merged_dim_data_id = src_merged_dim_access_id;
src_merged_dim_data_id(src_vector_access_dim) =
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
const SrcTData* p_src_tmp =
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
// these should be compile-time known
ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_normal_dim_access_id) {
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
auto src_normal_dim_data_id = src_normal_dim_access_id;
src_normal_dim_data_id(src_vector_access_dim) =
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
const index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
// load vector from src
const src_vector_t vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
// separate linear dimensions from non-linear dimensions
constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
long_vector_size == DstDataPerAccess,
"Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
"would drop");
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
// separate steps into linear and non-linear components, according to the dst tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);
p_buffer[buffer_offset] =
reinterpret_cast<const SrcTData*>(&vector_data)[i];
}
});
});
#endif
}
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);
// copy data from buffer into dst
{
using dst_vector_t = typename vector_type<DstTData, DstDataPerAccess>::MemoryType;
// loop over dst's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
// calculate step-sizes along dst's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
// move dst coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;
constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
// loop over dst's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
constexpr auto dst_normal_dim_access_lengths =
dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
// step-sizes along dst's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offset w.r.t. merged dimension need to be computed at run-time
static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_merged_dim_access_id_) {
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
dst_vector_access_dim,
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
// Loop over VectorAccessDim, and load data from src to the
// long-vector buffer.
// If VectorAccessDim is dst's linear dimension, then dst's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is dst's nonlinear dimension, then dst's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim should be dst's linear dimension.
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
DstTData* p_dst_tmp =
p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
const index_t buffer_offset = i * src_data_per_access;
// offset w.r.t. normal dimension can be computed at compile-time
static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id_) {
constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){};
// src offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
dst_vector_access_dim,
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
// Check the src vector's padding situation; only the first element of
// this src vector is checked. It is the user's responsibility to make sure
// all data in the src vector has the same padding situation.
if(src_coord.IsUpperIndexMappedToValidOffset())
{
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
}
}
dst_vector_t vector_data{};
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
// pack vector from buffer
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
dst_vector_access_dim, i);
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
reinterpret_cast<DstTData*>(&vector_data)[i] =
type_convert<DstTData>{}(p_buffer[buffer_offset]);
});
const index_t buffer_offset = i * dst_data_per_access;
constexpr index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// move dst coordinate along linear dimensions
const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
// write vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
});
});
#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// offset w.r.t. merged dimension need to be computed at run-time
ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_merged_dim_access_id) {
auto dst_merged_dim_data_id = dst_merged_dim_access_id;
dst_merged_dim_data_id(dst_vector_access_dim) =
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
DstTData* p_dst_tmp =
p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id) {
auto dst_normal_dim_data_id = dst_normal_dim_access_id;
dst_normal_dim_data_id(dst_vector_access_dim) =
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
dst_vector_t vector_data{};
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
#endif
// pack vector from buffer
for(index_t i = 0; i < DstDataPerAccess; ++i)
// Check the dst vector's padding situation; only the first element of
// this dst vector is checked. It is the user's responsibility to make sure
// all data in the dst vector has the same padding situation.
if(dst_coord.IsUpperIndexMappedToValidOffset())
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(dst_vector_access_dim) = i;
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
reinterpret_cast<DstTData*>(&vector_data)[i] =
type_convert<DstTData>{}(p_buffer[buffer_offset]);
static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
__buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
p_dst,
dst_nonlinear_coord.GetOffset(),
dst_linear_offset);
#else
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
#endif
}).Else([&](auto) {
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
}
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// write vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
});
}
});
});
}
__device__ static constexpr bool HasWorkingOptimizedAddressCalculation()
{
#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking
return true;
#else
return false;
#endif
}
}
// T can be Sequence or Array
template <class T, bool PositiveDirection>
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) {
mSrcSliceOrigin += step_sizes;
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
template <class T, bool PositiveDirection>
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
template <typename T, bool PositiveDirection>
__device__ void MoveDstSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) {
mDstSliceOrigin += step_sizes;
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
}
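// Hedged usage sketch (the object name "copy" and the step value KPerBlock are
// assumptions): advance the src slice window along dimension 1 on each
// outer-loop iteration, e.g.
//   copy.MoveSrcSliceWindow(Sequence<0, KPerBlock, 0>{}, integral_constant<bool, true>{});
// and pass integral_constant<bool, false>{} to move in the negative direction.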
private:
SrcCoordinate mSrcSliceOrigin;
DstCoordinate mDstSliceOrigin;
SrcCoord mSrcSliceOrigin;
DstCoord mDstSliceOrigin;
};
} // namespace ck
......
......@@ -4,12 +4,9 @@
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include "bfloat16_dev.hpp"
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
namespace ck {
......
......@@ -24,6 +24,21 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
return os;
}
template <class Range>
std::ostream& LogRangeStrided(std::ostream& os, Range&& range, std::string delim, size_t stride)
{
bool first = true;
for(size_t idx = 0; idx < range.size(); idx += stride)
{
if(first)
first = false;
else
os << delim;
os << range[idx];
}
return os;
}
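// Example usage (values are illustrative): print every 4th element of a range
//   std::vector<float> v(16, 1.f);
//   LogRangeStrided(std::cout << "v: ", v, ",", 4) << std::endl;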
typedef enum {
Half = 0,
Float = 1,
......
......@@ -1000,12 +1000,14 @@ int main(int argc, char* argv[])
}
#if CONV_DIRECTION_FWD_DATA // fwd data
check_error(out_nkhw_host, out_nkhw_device);
#elif CONV_DIRECTION_BWD_WEIT // bwd wrw
//#elif CONV_DIRECTION_BWD_WEIT // bwd wrw
#elif CONV_DIRECTION_BWD_WEIT
check_error(wei_kcyx_host, wei_kcyx);
#endif
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device : ", out_nkhw.mData, ",") << std::endl;
//LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
//LogRangeStrided(std::cout << "out_nkhw_device : ", out_nkhw.mData, ",", 64) << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
......