Commit 7733dd88 authored by Chao Liu

use readfirstlane to force result into SGPR to reduce VGPR usage

parent 3b3cfae5
......@@ -423,17 +423,6 @@ struct DynamicMerge
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
#if 0
// I only want to do this check if idx_diff_up is known at compile-time
if(idx_diff_up[Number<0>{}] == 0)
{
static_for<0, NDimLow, 1>{}([&idx_diff_low](auto i){
idx_diff_low(i) = 0;
});
return;
}
#endif
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
......@@ -449,7 +438,19 @@ struct DynamicMerge
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
#else
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
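A minimal standalone sketch of the intrinsic the hack above relies on (HIP/Clang on an AMD GCN target is assumed; the helper name is hypothetical): __builtin_amdgcn_readfirstlane returns the value held by the lowest active lane of the wavefront, and the compiler keeps that result in an SGPR. It therefore preserves the value only when the input is already wavefront-uniform, which is exactly the precondition stated for idx_diff_low_const.

    // sketch: force a wavefront-uniform value into an SGPR to relieve VGPR pressure
    __device__ index_t force_into_sgpr(index_t wave_uniform_value)
    {
        // value-preserving only when every active lane holds the same value
        return __builtin_amdgcn_readfirstlane(wave_uniform_value);
    }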
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
......
......@@ -121,9 +121,9 @@ struct BlockwiseDynamicTensorSliceTransfer_v1r1
ThreadwiseTransfer threadwise_transfer_;
};
// this version is very likely to have scratch memory issue, due to:
// this version tends to have a scratch memory issue, due to:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
// 2. ThreadwiseDynamicTensorSliceTransfer_v1r1::Run() constructs new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
......@@ -289,7 +289,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r1
// this version does the following things to avoid the scratch memory issue
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
// 2. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
......@@ -465,7 +465,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r2
// this version does the following things to avoid the scratch memory issue
// 1. BlockwiseDynamicTensorSliceTransfer_v2r3 doesn't allocate thread buffer (array) as member
// 2. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
// 3. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
// 3. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
......@@ -485,7 +485,9 @@ template <index_t BlockSize,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
index_t DstDataStride,
bool ThreadTransferMoveBackSrcCoord = true,
bool ThreadTransferMoveBackDstCoord = true>
struct BlockwiseDynamicTensorSliceTransfer_v2r3
{
static constexpr index_t nDim =
......@@ -607,20 +609,25 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r3
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>;
using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r2<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorWriteDim,
1,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>;
1,
ThreadTransferMoveBackSrcCoord,
true>;
using ThreadwiseWrite =
ThreadwiseDynamicTensorSliceTransfer_v1r2<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorWriteDim,
1,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride,
true,
ThreadTransferMoveBackDstCoord>;
ThreadwiseRead threadwise_read_;
ThreadwiseWrite threadwise_write_;
......
......@@ -459,11 +459,24 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
const index_t N = b_k_n_global_desc.GetLength(I1);
// divide block work by [M, N]
#if 0
const index_t m_block_work_num = M / MPerBlock;
const index_t n_block_work_num = N / NPerBlock;
#else
// Hack: this forces the result into an SGPR
const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(M / MPerBlock);
const index_t n_block_work_num = __builtin_amdgcn_readfirstlane(N / NPerBlock);
#endif
#if 0
const index_t m_block_work_id = get_block_1d_id() / n_block_work_num;
const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
#else
// Hack: this forces the result into an SGPR
const index_t m_block_work_id =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / n_block_work_num);
const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
#endif
const index_t m_block_data_on_global = m_block_work_id * MPerBlock;
const index_t n_block_data_on_global = n_block_work_id * NPerBlock;
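A worked example of the decomposition above, with hypothetical sizes M = 256, N = 512, MPerBlock = NPerBlock = 128:

    // m_block_work_num = 256 / 128 = 2
    // n_block_work_num = 512 / 128 = 4
    // block id 5: m_block_work_id = 5 / 4 = 1
    //             n_block_work_id = 5 - 1 * 4 = 1
    // block data offsets on global: (1 * 128, 1 * 128) = (128, 128)

Every quantity here is identical across all threads of the workgroup, which is what makes the readfirstlane trick legal: the intrinsic only preserves the value if it is wavefront-uniform.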
......@@ -505,10 +518,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
AddressSpace::Lds,
InMemoryDataOperation::Set,
1,
1>(a_k_m_global_desc,
make_multi_index(0, m_block_data_on_global),
a_k_m_block_desc,
make_multi_index(0, 0));
1,
true,
true>(
a_k_m_global_desc,
make_multi_index(0, m_block_data_on_global),
a_k_m_block_desc,
make_multi_index(0, 0));
// B matrix blockwise copy
auto b_block_copy =
......@@ -531,10 +547,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
AddressSpace::Lds,
InMemoryDataOperation::Set,
1,
1>(b_k_n_global_desc,
make_multi_index(0, n_block_data_on_global),
b_k_n_block_desc,
make_multi_index(0, 0));
1,
#if 0
true,
#else
false,
#endif
true>(
b_k_n_global_desc,
make_multi_index(0, n_block_data_on_global),
b_k_n_block_desc,
make_multi_index(0, 0));
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
......@@ -599,7 +622,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
#if 0
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);
#else
// HACK: fuse the threadwise copy's move-back coordinate step with the src slice window move
constexpr auto b_block_slice_copy_step =
b_block_copy.threadwise_read_.GetCoordinateStepBack() + make_multi_index(KPerBlock, 0);
#endif
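To make the fusion concrete, suppose (hypothetically) the per-thread slice of the B copy has lengths {4, 2}. GetCoordinateStepBack() then returns (1 - 4, 0) = (-3, 0), so:

    // separate steps: move back by (-3, 0), then move window by (KPerBlock, 0)
    // fused step:     one move by (KPerBlock - 3, 0)

A single move_dynamic_tensor_coordinate replaces the pair of coordinate updates, which is why the B blockwise copy above is constructed with its move-back-src flag set to false while the A copy keeps the default true.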
// LDS double buffer: preload data into LDS
{
......
......@@ -7,11 +7,10 @@
namespace ck {
// threadwise_dynamic_tensor_slice_transfer_v1r1 has scratch memory issue, due to
// it constructs new tensor coordinate
template <typename SrcData,
typename DstData,
typename SrcDesc,
// this version tends to have a scratch memory issue, due to:
// 1. It keeps a reference to the tensor descriptor
// 2. It constructs new tensor coordinates in this->Run()
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
......@@ -23,153 +22,59 @@ template <typename SrcData,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
__host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1r1(
const SrcDesc& src_desc,
const DynamicTensorCoordinate_t<SrcDesc>& src_origin_coord,
const SrcData* p_src,
const DstDesc& dst_desc,
const DynamicTensorCoordinate_t<DstDesc>& dst_origin_coord,
DstData* p_dst)
struct ThreadwiseDynamicTensorSliceTransfer_v1r1
{
// comment: constructing the tensor coordinates here seems to cause a scratch memory issue
auto src_coord = src_origin_coord;
auto dst_coord = dst_origin_coord;
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t J0 = SliceLengths{}[0];
constexpr index_t J1 = SliceLengths{}[1];
bool forward_dim0 = true;
bool forward_dim1 = true;
// hardcoded for 2d loop for now
#pragma unroll
for(index_t j0 = 0; j0 < J0; ++j0)
{
#pragma unroll
for(index_t j1 = 0; j1 < J1; ++j1)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
src_desc.GetElementSpaceSize(),
p_dst,
dst_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
dst_desc.GetElementSpaceSize());
// move dim1 iterator
if(j1 < J1 - 1)
{
if(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1);
}
}
}
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
// switch dim1 iteration direction
forward_dim1 = !forward_dim1;
using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
// move dim0 iterator
if(j0 < J0 - 1)
{
if(forward_dim0)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0);
}
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_desc_(src_desc),
src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
dst_desc_(dst_desc),
dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
{
}
}
// threadwise_dynamic_tensor_slice_transfer_v1r2 does not have scratch memory issue, due to
// it does not construct new tensor coordinate
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
index_t SrcDstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
__host__ __device__ constexpr void
threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
DynamicTensorCoordinate_t<SrcDesc>& src_coord,
const SrcData* p_src,
const DstDesc& dst_desc,
DynamicTensorCoordinate_t<DstDesc>& dst_coord,
DstData* p_dst)
{
static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() ==
remove_reference_t<DstDesc>::GetNumOfDimension(),
"inconsistent # of dimension");
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1()
: ThreadwiseDynamicTensorSliceTransfer_v1r1(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
template <typename SrcData, typename DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
{
// comment: constructing the tensor coordinates here tends to cause a scratch memory issue
auto src_coord = src_slice_origin_;
auto dst_coord = dst_slice_origin_;
// TODO use constexpr for coordinate-step to make sure compiler behaves correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
make_dynamic_tensor_coordinate_step(src_desc_, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
make_dynamic_tensor_coordinate_step(src_desc_, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
make_dynamic_tensor_coordinate_step(src_desc_, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
make_dynamic_tensor_coordinate_step(src_desc_, make_multi_index(-1, 0));
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
make_dynamic_tensor_coordinate_step(dst_desc_, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
make_dynamic_tensor_coordinate_step(dst_desc_, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
make_dynamic_tensor_coordinate_step(dst_desc_, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
make_dynamic_tensor_coordinate_step(dst_desc_, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
......@@ -177,11 +82,12 @@ threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
bool forward_dim0 = true;
bool forward_dim1 = true;
// hardcoded for 2d loop for now
#pragma unroll
for(index_t j0 = 0; j0 < Len0; ++j0)
for(index_t i0 = 0; i0 < Len0; ++i0)
{
#pragma unroll
for(index_t j1 = 0; j1 < Len1; ++j1)
for(index_t i1 = 0; i1 < Len1; ++i1)
{
// do work
transfer_data<SrcData,
......@@ -193,27 +99,27 @@ threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
DstScalarStrideInVector>(
p_src,
src_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc,
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc_,
src_coord),
src_desc.GetElementSpaceSize(),
src_desc_.GetElementSpaceSize(),
p_dst,
dst_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc,
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc_,
dst_coord),
dst_desc.GetElementSpaceSize());
dst_desc_.GetElementSpaceSize());
// move dim1 iterator
if(j1 < Len1 - 1)
if(i1 < Len1 - 1)
{
if(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1);
move_dynamic_tensor_coordinate(src_desc_, src_coord, src_step_0_p1);
move_dynamic_tensor_coordinate(dst_desc_, dst_coord, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1);
move_dynamic_tensor_coordinate(src_desc_, src_coord, src_step_0_m1);
move_dynamic_tensor_coordinate(dst_desc_, dst_coord, dst_step_0_m1);
}
}
}
......@@ -222,274 +128,20 @@ threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
forward_dim1 = !forward_dim1;
// move dim0 iterator
if(j0 < Len0 - 1)
if(i0 < Len0 - 1)
{
if(forward_dim0)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0);
move_dynamic_tensor_coordinate(src_desc_, src_coord, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc_, dst_coord, dst_step_p1_0);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0);
move_dynamic_tensor_coordinate(src_desc_, src_coord, src_step_m1_0);
move_dynamic_tensor_coordinate(dst_desc_, dst_coord, dst_step_m1_0);
}
}
}
// move src and dst coordinate back to their origins
constexpr index_t loc0 = Len0 - 1;
constexpr index_t loc1 = Len0 % 2 == 0 ? 0 : Len1 - 1;
const auto src_step_back =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-loc0, -loc1));
const auto dst_step_back =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-loc0, -loc1));
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
}
else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
const auto dst_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));
const auto dst_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
const auto dst_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
constexpr index_t Len2 = SliceLengths{}[2];
constexpr index_t Len3 = SliceLengths{}[3];
bool forward_dim0 = true;
bool forward_dim1 = true;
bool forward_dim2 = true;
bool forward_dim3 = true;
#pragma unroll
for(index_t j0 = 0; j0 < Len0; ++j0)
{
#pragma unroll
for(index_t j1 = 0; j1 < Len1; ++j1)
{
#pragma unroll
for(index_t j2 = 0; j2 < Len2; ++j2)
{
#pragma unroll
for(index_t j3 = 0; j3 < Len3; ++j3)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc,
src_coord),
src_desc.GetElementSpaceSize(),
p_dst,
dst_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc,
dst_coord),
dst_desc.GetElementSpaceSize());
// move dim1 iterator
if(j3 < Len3 - 1)
{
if(forward_dim3)
{
move_dynamic_tensor_coordinate(
src_desc, src_coord, src_step_0_0_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_coord, dst_step_0_0_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_coord, src_step_0_0_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_coord, dst_step_0_0_0_m1);
}
}
}
// switch dim3 iteration direction
forward_dim3 = !forward_dim3;
// move dim1 iterator
if(j2 < Len2 - 1)
{
if(forward_dim2)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_0_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_0_p1_0);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_0_m1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_0_m1_0);
}
}
}
// switch dim2 iteration direction
forward_dim2 = !forward_dim2;
// move dim1 iterator
if(j1 < Len1 - 1)
{
if(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1_0_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1_0_0);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1_0_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1_0_0);
}
}
}
// switch dim1 iteration direction
forward_dim1 = !forward_dim1;
// move dim0 iterator
if(j0 < Len0 - 1)
{
if(forward_dim0)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0_0_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0_0_0);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0_0_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0_0_0);
}
}
}
// move src and dst coordinate back to their origins
constexpr index_t loc0 = Len0 - 1;
constexpr index_t loc1 = Len0 % 2 == 0 ? 0 : Len1 - 1;
constexpr index_t loc2 = Len1 % 2 == 0 ? 0 : Len2 - 1;
constexpr index_t loc3 = Len2 % 2 == 0 ? 0 : Len3 - 1;
const auto src_step_back = make_dynamic_tensor_coordinate_step(
src_desc, make_multi_index(-loc0, -loc1, -loc2, -loc3));
const auto dst_step_back = make_dynamic_tensor_coordinate_step(
dst_desc, make_multi_index(-loc0, -loc1, -loc2, -loc3));
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
}
}
// this version has scratch memory issue, due to:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
index_t SrcDstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
struct ThreadwiseDynamicTensorSliceTransfer_v1r1
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_desc_(src_desc),
src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
dst_desc_(dst_desc),
dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
{
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1()
: ThreadwiseDynamicTensorSliceTransfer_v1r1(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
template <typename SrcData, typename DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
{
threadwise_dynamic_tensor_slice_transfer_v1r1<SrcData,
DstData,
SrcDesc,
DstDesc,
SliceLengths,
SrcDstDimAccessOrder,
SrcDstVectorDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
src_desc_, src_slice_origin_, p_src, dst_desc_, dst_slice_origin_, p_dst);
}
__device__ void SetSrcSliceOrigin(const Index& src_slice_origin_idx)
......@@ -505,6 +157,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r1
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrcSliceWindow(const Index& src_slice_origin_step_idx)
{
// is it OK to construct a new step every time?
const auto src_slice_origin_step =
make_dynamic_tensor_coordinate_step(src_desc_, src_slice_origin_step_idx);
......@@ -514,6 +167,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r1
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveDstSliceWindow(const Index& dst_slice_origin_step_idx)
{
// is it OK to construct a new step every time?
const auto dst_slice_origin_step =
make_dynamic_tensor_coordinate_step(dst_desc_, dst_slice_origin_step_idx);
......@@ -528,9 +182,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r1
DstCoord dst_slice_origin_;
};
// this version does not have scratch memory issue, due to:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
// this version is less likely to have a scratch memory issue, due to:
// 1. It does not keep a reference to the tensor descriptor
// 2. It does not construct new tensor coordinates in this->Run()
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
......@@ -542,7 +196,9 @@ template <typename SrcDesc,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
index_t DstScalarStrideInVector,
bool MoveBackSrcCoord = true,
bool MoveBackDstCoord = true>
struct ThreadwiseDynamicTensorSliceTransfer_v1r2
{
static constexpr index_t nDim = SliceLengths::Size();
......@@ -573,21 +229,302 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
__device__ void
Run(const SrcDesc& src_desc, const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
{
threadwise_dynamic_tensor_slice_transfer_v1r2<SrcData,
DstData,
SrcDesc,
DstDesc,
SliceLengths,
SrcDstDimAccessOrder,
SrcDstVectorDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
src_desc, src_slice_origin_, p_src, dst_desc, dst_slice_origin_, p_dst);
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
// TODO use constexpr for coordinate-step to make sure compiler behaves correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
bool forward_dim0 = true;
bool forward_dim1 = true;
#pragma unroll
for(index_t i0 = 0; i0 < Len0; ++i0)
{
#pragma unroll
for(index_t i1 = 0; i1 < Len1; ++i1)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
// move dim1 iterator
if(i1 < Len1 - 1)
{
if(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
}
}
}
// switch dim1 iteration direction
forward_dim1 = !forward_dim1;
// move dim0 iterator
if(i0 < Len0 - 1)
{
if(forward_dim0)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_m1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_m1_0);
}
}
}
}
else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
{
// TODO use constexpr for coordinate-step to make sure compiler behaves correctly
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
const auto dst_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));
const auto dst_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
const auto dst_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
constexpr index_t Len2 = SliceLengths{}[2];
constexpr index_t Len3 = SliceLengths{}[3];
bool forward_dim0 = true;
bool forward_dim1 = true;
bool forward_dim2 = true;
bool forward_dim3 = true;
#pragma unroll
for(index_t i0 = 0; i0 < Len0; ++i0)
{
#pragma unroll
for(index_t i1 = 0; i1 < Len1; ++i1)
{
#pragma unroll
for(index_t i2 = 0; i2 < Len2; ++i2)
{
#pragma unroll
for(index_t i3 = 0; i3 < Len3; ++i3)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
// move dim3 iterator
if(i3 < Len3 - 1)
{
if(forward_dim3)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_m1);
}
}
}
// switch dim3 iteration direction
forward_dim3 = !forward_dim3;
// move dim2 iterator
if(i2 < Len2 - 1)
{
if(forward_dim2)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_p1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_p1_0);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_m1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_m1_0);
}
}
}
// switch dim2 iteration direction
forward_dim2 = !forward_dim2;
// move dim1 iterator
if(i1 < Len1 - 1)
{
if(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1_0_0);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1_0_0);
}
}
}
// switch dim1 iteration direction
forward_dim1 = !forward_dim1;
// move dim0 iterator
if(i0 < Len0 - 1)
{
if(forward_dim0)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_p1_0_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_p1_0_0_0);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_m1_0_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_m1_0_0_0);
}
}
}
}
// move src and dst coordinate back to their origins
if constexpr(MoveBackSrcCoord)
{
const auto src_step_back =
make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateStepBack());
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_back);
}
if constexpr(MoveBackDstCoord)
{
const auto dst_step_back =
make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateStepBack());
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_back);
}
}
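For reference, the zig-zag order the 2D branch of Run() produces for a hypothetical SliceLengths of {2, 3} is:

    // i0 = 0 (forward_dim1 == true ): (0,0) -> (0,1) -> (0,2)
    // i0 = 1 (forward_dim1 == false): (1,2) -> (1,1) -> (1,0)

Each hop is a single +1/-1 step along one dimension, so every coordinate update stays cheap, and the final position is exactly what GetCoordinateStepBack() below undoes.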
__device__ static constexpr auto GetCoordinateStepBack()
{
MultiIndex<nDim> step_back;
step_back(Number<0>{}) = 1 - SliceLengths{}[0];
static_for<1, nDim, 1>{}([&](auto i) {
step_back(i) = (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
});
return step_back;
}
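Evaluated for the 2D case, this gives step_back[0] = 1 - Len0, and step_back[1] = 0 when Len0 is even (the last row is traversed backward and ends at column 0) or 1 - Len1 when Len0 is odd (the last row ends at column Len1 - 1). For example, with hypothetical slice lengths {4, 8} the step back is (-3, 0), and with {3, 8} it is (-2, -7).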
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
......@@ -604,7 +541,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// is it OK to do this every time?
// is it OK to construct a new step every time?
const auto src_slice_origin_step =
make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);
......@@ -615,7 +552,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// is it OK to do this every time?
// is it OK to construct a new step every time?
const auto dst_slice_origin_step =
make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);
......
......@@ -74,6 +74,14 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
// hack: has an underlying assumption that needs to be satisfied, otherwise it's a bug
// hack for forcing the register allocator to keep idx_diff_low_const in an SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif
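Because of the #ifndef guard the hack is off by default and can be switched on per build without editing this header, e.g. with a hypothetical compile invocation:

    hipcc -DCK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE=1 ...

Defining it to 1 selects the readfirstlane path in DynamicMerge shown earlier in this commit.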
// workaround: put all workaround here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
......