Commit 68ea43b1 authored by Chao Liu

add vector load and access order to threadwise copy v1r3

parent 5fdccfce
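For orientation (not part of the diff): "vector load and access order" means the destination side of the threadwise copy now walks its slice in a caller-specified DimAccessOrder and issues one vector store of DstScalarPerVector scalars per step along DstVectorDim, which is what the new ThreadwiseDynamicTensorSliceTransfer_v1r3 below implements. A minimal host-side sketch of that access enumeration, in plain C++ with illustrative names and lengths (the real kernel additionally zig-zags so the coordinate moves by only one access step between stores):

#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 4;
    // e.g. Sequence<MRepeat, MPerThread, NRepeat, NPerThread>
    const std::array<int, nDim> slice_lengths    = {2, 4, 2, 4};
    const std::array<int, nDim> dim_access_order = {0, 1, 2, 3}; // CThreadTransferSrcDstAccessOrder analogue
    const int dst_vector_dim        = 3;                         // CThreadTransferSrcDstVectorDim analogue
    const int dst_scalar_per_vector = 4;                         // CThreadTransferDstScalarPerVector analogue

    // how many scalars one access covers in each dimension
    std::array<int, nDim> scalar_per_access{};
    for(int i = 0; i < nDim; ++i)
        scalar_per_access[i] = (i == dst_vector_dim) ? dst_scalar_per_vector : 1;

    // enumerate accesses in the requested order; each access is one vector store
    const int d0 = dim_access_order[0], d1 = dim_access_order[1];
    const int d2 = dim_access_order[2], d3 = dim_access_order[3];

    std::array<int, nDim> idx{};
    for(idx[d0] = 0; idx[d0] < slice_lengths[d0]; idx[d0] += scalar_per_access[d0])
        for(idx[d1] = 0; idx[d1] < slice_lengths[d1]; idx[d1] += scalar_per_access[d1])
            for(idx[d2] = 0; idx[d2] < slice_lengths[d2]; idx[d2] += scalar_per_access[d2])
                for(idx[d3] = 0; idx[d3] < slice_lengths[d3]; idx[d3] += scalar_per_access[d3])
                    std::printf("store %d scalars at (%d, %d, %d, %d)\n",
                                dst_scalar_per_vector, idx[0], idx[1], idx[2], idx[3]);
}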
......@@ -492,7 +492,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
0,
GemmABlockTransferSrcScalarPerVector_GemmK,
GemmABlockTransferDstScalarPerVector_GemmM,
true, // move back src coordinate after threadwise copy
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
Sequence<0, 1>,
......
......@@ -502,11 +502,11 @@ struct DynamicMerge
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ void UpdateLowerIndex_1(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
const UpIdx& /* idx_up_new */,
Number<Hack>) const
__host__ __device__ void UpdateLowerIndex_1a(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
const UpIdx& /* idx_up_new */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
......@@ -640,6 +640,148 @@ struct DynamicMerge
}
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ void UpdateLowerIndex_1b(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
const UpIdx& /* idx_up_new */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However:
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
//    can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value doesn't
//    change during the whole kernel execution, then idx_diff_low_const
//    also doesn't change during the whole kernel execution. The
//    compiler-generated ISA should then calculate idx_diff_low_const
//    only once and keep it for the whole kernel execution.
// If neither 1) nor 2) is satisfied, the calculation is done at run-time
// each time this function is called, which can be very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_low += idx_diff_low;
}
else if constexpr(Hack == 2)
{
// do borrow check on each low dimension in reversed order
// do not need to check the first dimension
index_t borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t negative_idx_low_tmp = borrow - idx_low[i];
bool do_borrow = negative_idx_low_tmp > idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) -= borrow;
borrow = do_borrow ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
idx_low += idx_diff_low;
}
else
{
// do carry/borrow check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_low += idx_diff_low;
}
}
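For reference (not part of the diff), a small host-side model of the Hack == 1 carry path above: the upper-index step is decomposed once into per-dimension constants via the scan strides, and carries are then propagated from the fastest to the slowest lower dimension without re-dividing the full linear index. Lengths and indices below are illustrative, and the step is assumed smaller than the merged length, as in the kernel:

#include <array>
#include <cassert>
#include <cstdio>

int main()
{
    constexpr int nDimLow = 3;
    const std::array<int, nDimLow> low_lengths      = {4, 3, 5};  // merged length = 60
    const std::array<int, nDimLow> low_lengths_scan = {15, 5, 1}; // product of lengths to the right

    std::array<int, nDimLow> idx_low = {1, 2, 4}; // current lower index (linear index 29)
    const int idx_diff_up            = 7;         // step along the merged upper dimension

    // 1) constant decomposition of the step (the integer divisions, done once)
    std::array<int, nDimLow> diff_const{};
    int tmp = idx_diff_up;
    for(int i = 0; i < nDimLow - 1; ++i)
    {
        diff_const[i] = tmp / low_lengths_scan[i];
        tmp -= diff_const[i] * low_lengths_scan[i];
    }
    diff_const[nDimLow - 1] = tmp;

    // 2) carry propagation from the last dimension towards the first
    std::array<int, nDimLow> idx_diff_low{};
    int carry = 0;
    for(int i = nDimLow - 1; i > 0; --i)
    {
        const bool do_carry = idx_low[i] + carry >= low_lengths[i] - diff_const[i];
        idx_diff_low[i]     = (do_carry ? diff_const[i] - low_lengths[i] : diff_const[i]) + carry;
        carry               = do_carry ? 1 : 0;
    }
    idx_diff_low[0] = diff_const[0] + carry;

    for(int i = 0; i < nDimLow; ++i)
        idx_low[i] += idx_diff_low[i];

    // cross-check against a full re-decomposition of the new linear index
    const int linear_new = (1 * 3 + 2) * 5 + 4 + idx_diff_up; // old linear index 29 plus the step
    assert(idx_low[0] == linear_new / 15);
    assert(idx_low[1] == linear_new / 5 % 3);
    assert(idx_low[2] == linear_new % 5);
    std::printf("new lower index: %d %d %d\n", idx_low[0], idx_low[1], idx_low[2]);
}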
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
......@@ -705,11 +847,15 @@ struct DynamicMerge
do_carry = idx_low_tmp >= low_lengths_[i];
#if 0
// TODO: use exec-mask inline asm
if(do_carry)
{
idx_diff_low(i) -= low_lengths_[i];
}
#else
idx_diff_low(i) = do_carry ? idx_diff_low[i] - low_lengths_[i] : idx_diff_low[i];
#endif
idx_low(i) += idx_diff_low[i];
});
......@@ -733,11 +879,15 @@ struct DynamicMerge
do_borrow = idx_low_tmp < 0;
#if 0
// TODO: use exec-mask inline asm
if(do_borrow)
{
idx_diff_low(i) += low_lengths_[i];
}
#else
idx_diff_low(i) = do_borrow ? idx_diff_low[i] + low_lengths_[i] : idx_diff_low[i];
#endif
idx_low(i) += idx_diff_low[i];
});
......@@ -765,8 +915,10 @@ struct DynamicMerge
const UpIdx& idx_up_new,
Number<Hack>) const
{
#if 1
UpdateLowerIndex_1(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#if 0
UpdateLowerIndex_1a(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#elif 0
UpdateLowerIndex_1b(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#else
UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#endif
......
......@@ -4,6 +4,7 @@
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"
#include "ConstantMatrixDescriptor.hpp"
......@@ -364,9 +365,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// define input tensor descriptor for threadwise copy
// thread input tensor, src of threadwise copy
#if 0 // debug
constexpr auto c_m0_m1_n0_n1_thread_desc =
make_dynamic_naive_tensor_descriptor_packed<4>(
make_multi_index(MRepeat, MPerThread, NRepeat, NPerThread));
#else
constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed(
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>{});
#endif
// calculate origin of thread input tensor on global memory
// blockwise GEMM c matrix starting index
......@@ -379,6 +385,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col;
#if 0
ThreadwiseDynamicTensorSliceTransfer_v1r2<
AccFloat,
Float,
......@@ -404,6 +411,28 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
n_thread_data_on_global % N1))
.Run_hack(
c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#else
ThreadwiseDynamicTensorSliceTransfer_v1r3<
AccFloat,
Float,
decltype(c_m0_m1_n0_n1_thread_desc),
decltype(c_m0_m1_n0_n1_global_desc),
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
true,
true>(c_m0_m1_n0_n1_global_desc,
make_multi_index(m_thread_data_on_global / M1,
m_thread_data_on_global % M1,
n_thread_data_on_global / N1,
n_thread_data_on_global % N1))
.Run_hack(p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#endif
}
}
......
......@@ -67,7 +67,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
}
__device__ void
Run(const SrcDesc& src_desc, const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
Run_hack(const SrcDesc& src_desc, const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
{
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
......@@ -152,7 +152,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
}
else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
#if 0
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
......@@ -192,6 +193,48 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#else
// hack for output tensor
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, 1), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, -1), Sequence<0, 0, 2, 0, 0>{});
const auto dst_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 1, 0), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, -1, 0), Sequence<0, 0, 2, 0, 0>{});
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#endif
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
......@@ -336,331 +379,398 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
return back_step;
}
__device__ void
Run_hack(const SrcDesc& src_desc, const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
// is it OK to construct a new step every time?
const auto src_slice_origin_step =
make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_slice_origin_step);
}
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
// dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// is it OK to construct a new step every time?
const auto dst_slice_origin_step =
make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_slice_origin_step);
}
#pragma unroll
for(index_t iter0 = 0; iter0 < Len0; ++iter0)
{
#pragma unroll
for(index_t iter1 = 0; iter1 < Len1; ++iter1)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
private:
SrcCoord src_slice_origin_;
DstCoord dst_slice_origin_;
};
// move dim1 iterator
if(iter1 < Len1 - 1)
{
bool forward_dim1 = (iter0 % 2 == 0);
// this version is less likely to have a scratch memory issue, because:
// 1. it does not keep a reference to the tensor descriptor
// 2. it does not construct a new tensor coordinate for this->Run()
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t DstVectorDim,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t DstScalarStrideInVector,
bool SrcResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseDynamicTensorSliceTransfer_v1r3
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
if(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
}
}
}
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
// move dim0 iterator
if(iter0 < Len0 - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
}
}
}
else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
#if 0
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3(
const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
: dst_slice_origin_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx))
{
}
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3()
: ThreadwiseDynamicTensorSliceTransfer_v1r3(DstDesc{}, make_zero_multi_index<nDim>())
{
}
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_slice_origin_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
const auto dst_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
const auto dst_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));
__device__ void Run_hack(const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
{
// hardcoded for 4D
// TODO implement N-D
static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4,
"wrong! hardcoded for 4D tensor");
const auto dst_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
const auto dst_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));
constexpr auto dst_scalar_per_access = [&]() {
Index dst_scalar_per_access;
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == DstVectorDim)
{
dst_scalar_per_access(i) = DstScalarPerVector;
}
else
{
dst_scalar_per_access(i) = 1;
}
});
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#else
// hack for output tensor
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
return dst_scalar_per_access;
}();
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
constexpr auto dst_scalar_step_in_vector = [&]() {
Index dst_scalar_step_in_vector;
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == DstVectorDim)
{
dst_scalar_step_in_vector(i) = 1;
}
else
{
dst_scalar_step_in_vector(i) = 0;
}
});
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
return dst_scalar_step_in_vector;
}();
const auto dst_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, 1), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, -1), Sequence<0, 0, 2, 0, 0>{});
constexpr auto access_lengths = [&]() {
Index access_lengths;
const auto dst_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 1, 0), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, -1, 0), Sequence<0, 0, 2, 0, 0>{});
static_for<0, nDim, 1>{}(
[&](auto i) { access_lengths(i) = SliceLengths{}[i] / dst_scalar_per_access[i]; });
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
return access_lengths;
}();
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#if 0
const auto dst_forward_steps =
make_tuple(make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(1, 0, 0, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(0, 1, 0, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(0, 0, 1, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(0, 0, 0, 1) * dst_scalar_per_access),
const auto dst_backward_steps =
make_tuple(make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(-1, 0, 0, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(0, -1, 0, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(0, 0, -1, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(0, 0, 0, -1) * dst_scalar_per_access));
#else
// hack for NKHW output tensor
const auto dst_forward_steps =
make_tuple(make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(1, 0, 0, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(0, 1, 0, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step_hack(dst_desc,
make_multi_index(0, 0, 1, 0) *
dst_scalar_per_access,
Sequence<0, 0, 1, 0, 0>{}),
make_dynamic_tensor_coordinate_step_hack(dst_desc,
make_multi_index(0, 0, 0, 1) *
dst_scalar_per_access,
Sequence<0, 0, 1, 0, 0>{}));
const auto dst_backward_steps =
make_tuple(make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(-1, 0, 0, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(0, -1, 0, 0) * dst_scalar_per_access),
make_dynamic_tensor_coordinate_step_hack(dst_desc,
make_multi_index(0, 0, -1, 0) *
dst_scalar_per_access,
Sequence<0, 0, 2, 0, 0>{}),
make_dynamic_tensor_coordinate_step_hack(dst_desc,
make_multi_index(0, 0, 0, -1) *
dst_scalar_per_access,
Sequence<0, 0, 2, 0, 0>{}));
#endif
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
constexpr index_t Len2 = SliceLengths{}[2];
constexpr index_t Len3 = SliceLengths{}[3];
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
#pragma unroll
for(index_t iter0 = 0; iter0 < Len0; ++iter0)
{
#pragma unroll
for(index_t iter1 = 0; iter1 < Len1; ++iter1)
{
#pragma unroll
for(index_t iter2 = 0; iter2 < Len2; ++iter2)
{
#pragma unroll
for(index_t iter3 = 0; iter3 < Len3; ++iter3)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
index_t counter = 0;
// move dim1 iterator
if(iter3 < Len3 - 1)
{
bool forward_dim3 = (iter2 % 2 == 0);
// loop over dim0
static_for<0,
SliceLengths{}[DimAccessOrder{}[I0]],
dst_scalar_per_access[DimAccessOrder{}[I0]]>{}([&](auto iter0) {
constexpr index_t i0 = iter0;
if(forward_dim3)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_m1);
}
}
}
constexpr bool forward_dim1 =
(iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) % 2 == 0;
// move dim1 iterator
if(iter2 < Len2 - 1)
// loop over dim1
static_for<0,
SliceLengths{}[DimAccessOrder{}[I1]],
dst_scalar_per_access[DimAccessOrder{}[I1]]>{}([&](auto iter1) {
constexpr index_t i1 =
forward_dim1 ? iter1
: SliceLengths{}[DimAccessOrder{}[I1]] -
dst_scalar_per_access[DimAccessOrder{}[I1]] - iter1;
constexpr bool forward_dim2 =
((iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) *
access_lengths[DimAccessOrder{}[I1]] +
(iter1 / dst_scalar_per_access[DimAccessOrder{}[I1]])) %
2 ==
0;
// loop over dim2
static_for<0,
SliceLengths{}[DimAccessOrder{}[I2]],
dst_scalar_per_access[DimAccessOrder{}[I2]]>{}([&](auto iter2) {
constexpr index_t i2 =
forward_dim2 ? iter2
: SliceLengths{}[DimAccessOrder{}[I2]] -
dst_scalar_per_access[DimAccessOrder{}[I2]] - iter2;
constexpr bool forward_dim3 =
(((iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) *
access_lengths[DimAccessOrder{}[I1]] +
(iter1 / dst_scalar_per_access[DimAccessOrder{}[I1]])) *
access_lengths[DimAccessOrder{}[I2]] +
(iter2 / dst_scalar_per_access[DimAccessOrder{}[I2]])) %
2 ==
0;
// loop over dim3
static_for<0,
SliceLengths{}[DimAccessOrder{}[I3]],
dst_scalar_per_access[DimAccessOrder{}[I3]]>{}([&](auto iter3) {
constexpr index_t i3 =
forward_dim3 ? iter3
: SliceLengths{}[DimAccessOrder{}[I3]] -
dst_scalar_per_access[DimAccessOrder{}[I3]] - iter3;
// do work
// hardcoding for buffer_store
// TODO refactor transfer_data() to encapsulate this
static_assert(SrcAddressSpace == AddressSpace::Vgpr &&
DstAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_store");
using DstVectorType =
typename vector_type<DstData, DstScalarPerVector>::MemoryType;
vector_type<DstData, DstScalarPerVector> dst_vector;
// this is hardcoded for src that has compile-time tensor descriptor
static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
// hack: assume src_slice_origin_idx is 0
constexpr index_t src_offset = SrcDesc::CalculateOffset(
container_reorder_given_old2new(make_multi_index(i0, i1, i2, i3),
DimAccessOrder{}) +
i * dst_scalar_step_in_vector);
dst_vector(i) = p_src[Number<src_offset>{}];
});
amd_buffer_store_v2<DstData, DstScalarPerVector>(
dst_vector.Vector(),
p_dst,
dst_slice_origin_coord_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_coord_),
dst_desc.GetElementSpaceSize());
// move along dim3
if constexpr(iter3 < SliceLengths{}[DimAccessOrder{}[I3]] -
dst_scalar_per_access[DimAccessOrder{}[I3]])
{
bool forward_dim2 = (iter1 % 2 == 0);
if(forward_dim2)
if constexpr(forward_dim3)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_p1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_p1_0);
dst_desc,
dst_slice_origin_coord_,
dst_forward_steps[DimAccessOrder{}[I3]]);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_m1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_m1_0);
dst_desc,
dst_slice_origin_coord_,
dst_backward_steps[DimAccessOrder{}[I3]]);
}
}
}
});
// move dim1 iterator
if(iter1 < Len1 - 1)
// move along dim2
if constexpr(iter2 < SliceLengths{}[DimAccessOrder{}[I2]] -
dst_scalar_per_access[DimAccessOrder{}[I2]])
{
bool forward_dim1 = (iter0 % 2 == 0);
if(forward_dim1)
if constexpr(forward_dim2)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1_0_0);
move_dynamic_tensor_coordinate(dst_desc,
dst_slice_origin_coord_,
dst_forward_steps[DimAccessOrder{}[I2]]);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1_0_0);
dst_desc,
dst_slice_origin_coord_,
dst_backward_steps[DimAccessOrder{}[I2]]);
}
}
}
});
// move dim0 iterator:
if(iter0 < Len0 - 1)
// move along dim1
if constexpr(iter1 < SliceLengths{}[DimAccessOrder{}[I1]] -
dst_scalar_per_access[DimAccessOrder{}[I1]])
{
// move forward in dim0
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0_0_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0_0_0);
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(dst_desc,
dst_slice_origin_coord_,
dst_forward_steps[DimAccessOrder{}[I1]]);
}
else
{
move_dynamic_tensor_coordinate(dst_desc,
dst_slice_origin_coord_,
dst_backward_steps[DimAccessOrder{}[I1]]);
}
}
}
}
// move src and dst coordinate back to their origins
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_back_step =
make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
});
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
}
// move along dim0
if constexpr(iter0 < SliceLengths{}[DimAccessOrder{}[I0]] -
dst_scalar_per_access[DimAccessOrder{}[I0]])
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_coord_, dst_forward_steps[DimAccessOrder{}[I0]]);
}
});
// move dst coordinate back to slice origin (or not)
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_back_step =
make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
make_dynamic_tensor_coordinate_step(dst_desc, GetDstCoordinateBackStep());
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, dst_back_step);
}
}
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
__device__ static constexpr auto GetDstCoordinateBackStep()
{
// is it OK to construct a new step every time?
const auto src_slice_origin_step =
make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);
constexpr auto dst_scalar_per_access = [&]() {
Index dst_scalar_per_access;
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_slice_origin_step);
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == DstVectorDim)
{
dst_scalar_per_access(i) = DstScalarPerVector;
}
else
{
dst_scalar_per_access(i) = 1;
}
});
return dst_scalar_per_access;
}();
MultiIndex<nDim> dst_back_step;
// TODO: this is wrong, need to consider DimAccessOrder
dst_back_step(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
static_for<1, nDim, 1>{}([&](auto i) {
constexpr auto i_m1 = i - Number<1>{};
// TODO: this is wrong
dst_back_step(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
? 0
: (dst_scalar_per_access[i] - SliceLengths{}[i]);
});
return dst_back_step;
}
// dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if the dst coord was not reset after Run_hack(), then the step needs to be adjusted here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetDstCoordinateBackStep();
// is it OK to construct a new step every time?
const auto dst_slice_origin_step =
make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);
const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_slice_origin_step);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, adjusted_step);
}
private:
SrcCoord src_slice_origin_;
DstCoord dst_slice_origin_;
DstCoord dst_slice_origin_coord_;
};
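For orientation (not part of the diff): the nested static_for loops in Run_hack above traverse the slice in a zig-zag order, so that between two consecutive vector stores the destination coordinate moves by exactly one forward or backward access step. A minimal two-dimensional host-side sketch of that ordering, with plain ints standing in for the compile-time Number<> iterators:

#include <cstdio>

int main()
{
    constexpr int len0 = 3; // accesses along the slower dimension
    constexpr int len1 = 4; // accesses along the faster dimension

    for(int iter0 = 0; iter0 < len0; ++iter0)
    {
        // the faster dimension reverses direction on every other pass of the slower one;
        // in the 4D code the same parity test also folds in the counts of all outer dimensions
        const bool forward_dim1 = (iter0 % 2 == 0);

        for(int iter1 = 0; iter1 < len1; ++iter1)
        {
            const int i1 = forward_dim1 ? iter1 : (len1 - 1) - iter1;
            std::printf("access (%d, %d)\n", iter0, i1);
        }
    }
}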
// this version does the following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
......@@ -746,7 +856,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == SrcVectorDim)
{
src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
src_scalar_per_access(i) = SrcScalarPerVector;
}
else
{
......@@ -774,26 +884,17 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
return src_scalar_step_in_vector;
}();
constexpr auto access_lengths = [&]() {
Index access_lengths;
static_for<0, nDim, 1>{}(
[&](auto i) { access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i]; });
return access_lengths;
}();
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(
const auto src_step_0_p = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(0, 1) * src_scalar_per_access);
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(
const auto src_step_0_m = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(0, -1) * src_scalar_per_access);
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(
const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(1, 0) * src_scalar_per_access);
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(
const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
constexpr auto I0 = Number<0>{};
......@@ -815,18 +916,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
static_assert(SrcAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_load, src must be global mem");
#if 0 // only works for SrcScalarPerVector == 1
auto src_data = amd_buffer_load<SrcData, 1>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#else
vector_type<SrcData, SrcScalarPerVector> src_vector;
using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
......@@ -845,26 +934,25 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
buffer_(Number<buffer_offset>{}) = src_vector[i];
});
#endif
// move dim1 iterator
if constexpr(iter1.value < access_lengths[I1] - 1)
if constexpr(iter1.value < SliceLengths{}[I1] - src_scalar_per_access[I1])
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m);
}
}
});
// move dim0 iterator
if constexpr(iter0.value < access_lengths[I0] - 1)
if constexpr(iter0.value < SliceLengths{}[I0] - src_scalar_per_access[I0])
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p_0);
}
});
......@@ -888,14 +976,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto dst_step_0_p1 =
const auto dst_step_0_p =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
const auto dst_step_0_m =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
const auto dst_step_p_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
const auto dst_step_m_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
......@@ -927,12 +1015,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
dst_desc, dst_slice_origin_, dst_step_0_p);
}
else
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
dst_desc, dst_slice_origin_, dst_step_0_m);
}
}
});
......@@ -940,7 +1028,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
// move dim0 iterator
if constexpr(iter0.value < Len0 - 1)
{
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p_0);
}
});
}
......@@ -968,7 +1056,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == SrcVectorDim)
{
src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
src_scalar_per_access(i) = SrcScalarPerVector;
}
else
{
......@@ -1007,54 +1095,54 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
#if 0 // hack
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(
const auto src_step_0_p = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(0, 1) * src_scalar_per_access);
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(
const auto src_step_0_m = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(0, -1) * src_scalar_per_access);
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(
const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(1, 0) * src_scalar_per_access);
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(
const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
#elif 0
#elif 1
// for padded input tensor
const auto src_step_0_p1 =
const auto src_step_0_p =
make_dynamic_tensor_coordinate_step_hack(src_desc,
make_multi_index(0, 1) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 =
const auto src_step_p_0 =
make_dynamic_tensor_coordinate_step_hack(src_desc,
make_multi_index(1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
const auto src_step_m_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 1
// for non-padded input tensor
const auto src_step_0_p1 =
const auto src_step_0_p =
make_dynamic_tensor_coordinate_step_hack(src_desc,
make_multi_index(0, 1) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 =
const auto src_step_p_0 =
make_dynamic_tensor_coordinate_step_hack(src_desc,
make_multi_index(1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
const auto src_step_m_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0) * src_scalar_per_access,
Sequence<0, 0, 0, 0, 0, 2, 0>{});
......@@ -1079,28 +1167,17 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
static_assert(SrcAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_load, src must be global mem");
#if 0 // only works for SrcScalarPerVector == 1
auto src_data = amd_buffer_load<SrcData, 1>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#else
vector_type<SrcData, SrcScalarPerVector> src_vector;
using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
#if 1
src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
......@@ -1109,6 +1186,29 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
buffer_(Number<buffer_offset>{}) = src_vector[i];
});
#elif 0
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
buffer_(Number<buffer_offset>{}) = is_valid ? src_vector[i] : SrcData{0};
});
#elif 0
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
int32x2_t is_valid_i32 = is_valid;
asm volatile("\n \
v_cmp_gt_u32_e64 is_valid_flag, is_valid_i32, 0 \n \
v_cndmask_b32_e64 src_data, 0, src_data, is_valid_flag \n \
"
: "=s"(is_valid_flag), "=v"(src_data),
: "v"(is_valid_i32), "2"(is_valid_flag), "3"(src_data));
buffer_(Number<buffer_offset>{}) = src_data;
});
#endif
// move dim1 iterator
......@@ -1116,11 +1216,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m);
}
}
});
......@@ -1128,7 +1228,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
// move dim0 iterator
if constexpr(iter0.value < access_lengths[I0] - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p_0);
}
});
......@@ -1150,7 +1250,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == SrcVectorDim)
{
src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
src_scalar_per_access(i) = SrcScalarPerVector;
}
else
{
......@@ -1167,7 +1267,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
static_for<1, nDim, 1>{}([&](auto i) {
constexpr auto i_m1 = i - Number<1>{};
src_back_step(i) = (SliceLengths{}[i_m1] % (2 * src_scalar_per_access[i_m1]) == 0)
? 0
: (src_scalar_per_access[i] - SliceLengths{}[i]);
});
......@@ -1183,7 +1284,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == DstVectorDim)
{
dst_scalar_per_access(i) = DstScalarPerVector * DstScalarStrideInVector;
dst_scalar_per_access(i) = DstScalarPerVector;
}
else
{
......@@ -1200,13 +1301,15 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
static_for<1, nDim, 1>{}([&](auto i) {
constexpr auto i_m1 = i - Number<1>{};
dst_back_step(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
? 0
: (dst_scalar_per_access[i] - SliceLengths{}[i]);
});
return dst_back_step;
}
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
......@@ -1250,7 +1353,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
#if 0 // hack
const auto adjusted_step = make_dynamic_tensor_coordinate_step(
src_desc, adjusted_step_idx);
#elif 0
#elif 1
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
......
......@@ -97,8 +97,8 @@ __llvm_amdgcn_buffer_atomic_add_f32(float vdata,
#endif
// buffer_load requires:
// 1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
// 2) p_src_thread to be a wavewise pointer.
// 1) p_src_wave must be in global memory space
// 2) p_src_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
......@@ -118,6 +118,18 @@ __device__ void amd_buffer_store(const T* p_src_thread,
bool dst_thread_data_valid,
index_t dst_data_range);
// buffer_store requires:
// 1) p_dst_wave must be in global memory space
// 2) p_dst_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void
amd_buffer_store_v2(const typename vector_type<T, VectorSize>::MemoryType src_thread_data,
T* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range);
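As a usage note (not part of the diff), the sketch below mirrors how ThreadwiseDynamicTensorSliceTransfer_v1r3::Run_hack calls this new entry point; the wrapper name is hypothetical:

// hedged sketch: store one float4 through the raw buffer path declared above
__device__ void store_float4_example(const float4_t v,
                                      float* p_dst_wave,
                                      const index_t dst_offset,
                                      const bool is_valid,
                                      const index_t element_space_size)
{
    // one dwordx4 buffer_store; the store is suppressed when is_valid is false
    amd_buffer_store_v2<float, 4>(v, p_dst_wave, dst_offset, is_valid, element_space_size);
}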
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread to be a wavewise pointer.
......@@ -926,6 +938,126 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
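A brief note on the CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK path above: instead of branching on dst_thread_data_valid, the byte offset is shifted by 0x7fffffff, which pushes the access past the range written into the buffer resource (range[2] = dst_data_range * sizeof(float)), and out-of-range raw buffer stores are dropped by the hardware. A simplified host-side model of that range check (the real descriptor encodes more than just the range):

#include <cstdint>
#include <cstdio>

// simplified model of the hardware range check on a raw buffer access
bool buffer_store_is_performed(std::uint32_t byte_offset, std::uint32_t range_bytes)
{
    return byte_offset < range_bytes;
}

int main()
{
    const std::uint32_t range_bytes = 1024 * sizeof(float); // range[2] analogue
    const std::uint32_t byte_offset = 64 * sizeof(float);

    const bool cases[] = {true, false};
    for(bool dst_thread_data_valid : cases)
    {
        // same shift as the CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK path
        const std::uint32_t dst_addr_shift = dst_thread_data_valid ? 0u : 0x7fffffffu;
        std::printf("valid=%d -> store performed: %d\n",
                    dst_thread_data_valid,
                    buffer_store_is_performed(dst_addr_shift + byte_offset, range_bytes));
    }
}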
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
......
......@@ -91,7 +91,7 @@
#endif
// workaround: put all workaround here
// workaround for unnecessary VGPA <--> AGRP data movement when using mfma LLVM intrinsic
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif
......@@ -123,6 +123,8 @@ using index_t = uint32_t;
using index_t = int32_t;
#endif
typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
// int32x4_t is used by the buffer_load and buffer_store llvm intrinsics
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
......
......@@ -32,7 +32,8 @@ struct static_for
static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0,
"Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd),
"wrongs! should have NBegin <= NEnd");
"wrongs! should (Increment > 0 && NBegin <= NEnd) || (Increment < 0 && "
"NBegin >= NEnd)");
}
template <class F>
......
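As a usage note (not part of the diff): the relaxed assertion is what permits descending loops such as the reversed carry/borrow loops in DynamicMerge::UpdateLowerIndex earlier in this commit. A minimal hedged sketch, where do_something is a hypothetical callback:

static_for<3, 0, -1>{}([&](auto i) {
    // visits the compile-time values i = 3, 2, 1; the end bound 0 is exclusive,
    // and the assertion holds since (0 - 3) % -1 == 0
    do_something(i);
});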
......@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 2x2
constexpr index_t BlockSize = 256;
......@@ -166,7 +166,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<4, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 2;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>;
......@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto conv_driver =
#if 0
#if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
......
......@@ -22,7 +22,22 @@ int main(int argc, char* argv[])
{
using namespace ck;
#if 1
#if 0
// 3x3, 36x36, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t HI = 37;
constexpr index_t WI = 37;
constexpr index_t K = 384;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
......