Commit 7733dd88 authored by Chao Liu

use readfirstlane to force result into SGPR to reduce VGPR usage

parent 3b3cfae5
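The diff below applies one pattern in several places, so a minimal standalone sketch may help (illustrative only, with hypothetical kernel and variable names; this is not code from the repository): when a value is guaranteed to be identical for every lane of a wavefront, __builtin_amdgcn_readfirstlane lets the compiler keep it in a scalar register (SGPR) instead of replicating it across vector registers (VGPRs), which is what reduces VGPR usage.

#include <hip/hip_runtime.h>

__global__ void scale_by_quarter_length(const float* in, float* out, int row_length)
{
    // row_length / 4 is identical for every thread, but the compiler cannot always
    // prove that; readfirstlane forces the quotient into an SGPR. If the value were
    // NOT wavefront-uniform, this would silently pick lane 0's value, which is
    // exactly the hazard the comments in this commit warn about.
    const int quarter_length = __builtin_amdgcn_readfirstlane(row_length / 4);

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = in[i] * static_cast<float>(quarter_length);
}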
@@ -423,17 +423,6 @@ struct DynamicMerge
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
#if 0
// only do this check if idx_diff_up is known at compile-time
if(idx_diff_up[Number<0>{}] == 0)
{
static_for<0, NDimLow, 1>{}([&idx_diff_low](auto i){
idx_diff_low(i) = 0;
});
return;
}
#endif
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
@@ -449,7 +438,19 @@ struct DynamicMerge
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
#else
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
// Hack: this forces the result into an SGPR; the result must be thread-invariant
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
...
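The #else branch added to DynamicMerge above hand-rolls what CalculateLowerIndex computes and wraps only the final remainder in readfirstlane. A plain host-side sketch (hypothetical lengths; low_lengths_scan stands in for the member low_lengths_scan_) shows that this is the usual flat-offset-to-multi-index conversion: every lower dimension but the last takes a quotient against the running product of the lower lengths, and the last dimension takes the remainder.

#include <array>
#include <cstdio>

int main()
{
    // lower lengths {4, 3, 5}; the "scan" holds each dimension's stride: {15, 5, 1}
    const std::array<int, 3> low_lengths_scan = {15, 5, 1};

    int tmp = 37; // upper (flat) index difference
    std::array<int, 3> idx_diff_low{};

    for(int i = 0; i < 2; ++i) // all lower dimensions except the last
    {
        idx_diff_low[i] = tmp / low_lengths_scan[i];
        tmp -= idx_diff_low[i] * low_lengths_scan[i];
    }
    idx_diff_low[2] = tmp; // the device code wraps this remainder in readfirstlane

    std::printf("%d %d %d\n", idx_diff_low[0], idx_diff_low[1], idx_diff_low[2]); // prints: 2 1 2
}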
@@ -121,9 +121,9 @@ struct BlockwiseDynamicTensorSliceTransfer_v1r1
ThreadwiseTransfer threadwise_transfer_;
};
// this version is very likely to have a scratch memory issue, due to:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps a reference to the tensor descriptor
// 2. ThreadwiseDynamicTensorSliceTransfer_v1r1::Run() constructs a new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
@@ -289,7 +289,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r1
// this version does the following to avoid the scratch memory issue
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep a reference to the tensor descriptor
// 2. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct a new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
@@ -465,7 +465,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r2
// this version does the following to avoid the scratch memory issue
// 1. BlockwiseDynamicTensorSliceTransfer_v2r3 doesn't allocate a thread buffer (array) as a member
// 2. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep a reference to the tensor descriptor
// 3. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct a new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
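The comments on the v1r1 and v1r2-based variants above all point at the same scratch-memory fix. The sketch below is only an illustration of the structural difference they describe, using simplified hypothetical types rather than the library's real classes: one transfer object stores a reference to the descriptor and rebuilds a coordinate inside Run(), the other takes the descriptor as a call argument and keeps a coordinate that is only moved.

#include <cstddef>

// hypothetical stand-ins for a tensor descriptor and a tensor coordinate
struct Descriptor { std::size_t stride; };
struct Coordinate { std::size_t offset; };

// v1r1-style: keeps a reference member and constructs a fresh coordinate on every call;
// both are candidates for spilling to scratch if the object cannot stay in registers
struct TransferKeepingReference
{
    const Descriptor& desc_;

    void Run(const float* src, float* dst, std::size_t i) const
    {
        Coordinate coord{i * desc_.stride}; // rebuilt per call
        dst[coord.offset] = src[coord.offset];
    }
};

// v1r2-style: no descriptor member; the coordinate is a member that is only advanced
struct TransferWithoutReference
{
    Coordinate coord_;

    void Run(const Descriptor&, const float* src, float* dst) const
    {
        dst[coord_.offset] = src[coord_.offset];
    }

    void MoveSrcSliceWindow(const Descriptor& desc, std::size_t step)
    {
        coord_.offset += step * desc.stride;
    }
};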
@@ -485,7 +485,9 @@ template <index_t BlockSize,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride,
index_t ThreadTransferMoveBackSrcCoord = true,
index_t ThreadTransferMoveBackDstCoord = true>
struct BlockwiseDynamicTensorSliceTransfer_v2r3
{
static constexpr index_t nDim =
@@ -607,9 +609,12 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r3
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1,
ThreadTransferMoveBackSrcCoord,
true>;
using ThreadwiseWrite =
ThreadwiseDynamicTensorSliceTransfer_v1r2<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
@@ -620,7 +625,9 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r3
DstAddressSpace,
DstInMemOp,
1,
DstDataStride,
true,
ThreadTransferMoveBackDstCoord>;
ThreadwiseRead threadwise_read_;
ThreadwiseWrite threadwise_write_;
...
@@ -459,11 +459,24 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
const index_t N = b_k_n_global_desc.GetLength(I1);
// divide block work by [M, N]
#if 0
const index_t m_block_work_num = M / MPerBlock;
const index_t n_block_work_num = N / NPerBlock;
#else
// Hack: this forces the result into an SGPR
const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(M / MPerBlock);
const index_t n_block_work_num = __builtin_amdgcn_readfirstlane(N / NPerBlock);
#endif
#if 0
const index_t m_block_work_id = get_block_1d_id() / n_block_work_num;
const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
#else
// Hack: this forces the result into an SGPR
const index_t m_block_work_id =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / n_block_work_num);
const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
#endif
const index_t m_block_data_on_global = m_block_work_id * MPerBlock;
const index_t n_block_data_on_global = n_block_work_id * NPerBlock;
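A host-side sketch of the decomposition above (plain C++, hypothetical sizes): the launch grid is treated as a row-major m_block_work_num x n_block_work_num grid of tiles, each 1D block id is split into an (m, n) tile id, and the tile id is scaled to the tile's origin in the output. On the device these values are the same for every thread of a block, which is why the quotients are wrapped in readfirstlane.

#include <cstdio>

int main()
{
    const int M = 512, N = 256, MPerBlock = 128, NPerBlock = 128;

    const int m_block_work_num = M / MPerBlock; // 4
    const int n_block_work_num = N / NPerBlock; // 2

    for(int block_id = 0; block_id < m_block_work_num * n_block_work_num; ++block_id)
    {
        const int m_block_work_id = block_id / n_block_work_num;
        const int n_block_work_id = block_id - m_block_work_id * n_block_work_num;

        std::printf("block %d -> tile (%d, %d), data origin (%d, %d)\n",
                    block_id,
                    m_block_work_id,
                    n_block_work_id,
                    m_block_work_id * MPerBlock,  // m_block_data_on_global
                    n_block_work_id * NPerBlock); // n_block_data_on_global
    }
}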
@@ -505,7 +518,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
AddressSpace::Lds,
InMemoryDataOperation::Set,
1,
1,
true,
true>(
a_k_m_global_desc,
make_multi_index(0, m_block_data_on_global),
a_k_m_block_desc,
make_multi_index(0, 0));
@@ -531,7 +547,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
AddressSpace::Lds,
InMemoryDataOperation::Set,
1,
1,
#if 0
true,
#else
false,
#endif
true>(
b_k_n_global_desc,
make_multi_index(0, n_block_data_on_global),
b_k_n_block_desc,
make_multi_index(0, 0));
@@ -599,7 +622,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
#if 0
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);
#else
// HACK: fuse the threadwise copy's coordinate move-back with the move of the src slice window
constexpr auto b_block_slice_copy_step =
b_block_copy.threadwise_read_.GetCoordinateStepBack() + make_multi_index(KPerBlock, 0);
#endif
// LDS double buffer: preload data into LDS
{
...
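The fused b_block_slice_copy_step above can be read as follows (a minimal sketch in plain C++ with hypothetical values; step_back stands in for GetCoordinateStepBack() and window_step for make_multi_index(KPerBlock, 0)): instead of first stepping the read coordinate back to the slice origin after a copy and then moving the source slice window forward, the two index updates are summed and applied as a single coordinate move.

#include <array>
#include <cstdio>

using MultiIndex = std::array<int, 2>;

MultiIndex operator+(const MultiIndex& a, const MultiIndex& b)
{
    return {a[0] + b[0], a[1] + b[1]};
}

int main()
{
    const int KPerBlock = 8;

    MultiIndex coord             = {6, 3};         // where the coordinate sits after a copy
    const MultiIndex step_back   = {-6, -3};       // would return it to the slice origin
    const MultiIndex window_step = {KPerBlock, 0}; // move the slice window along K

    // fused update: one coordinate move instead of two
    coord = coord + (step_back + window_step);

    std::printf("%d %d\n", coord[0], coord[1]); // prints: 8 0
}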
@@ -74,6 +74,14 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
// hack: has an underlying assumption that needs to be satisfied, otherwise it's a bug
// hack for forcing idx_diff_low_const to be kept in an SGPR; idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif
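The new switch defaults to off. Presumably it is meant to be enabled the same way as the other CK_* options, for example with a compile-time definition (an assumption; the commit shown here does not include a build-system change):

hipcc -DCK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE=1 ...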
// workaround: put all workarounds here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
...