gaoqiong / composable_kernel

Commit 7733dd88
authored Nov 28, 2020 by Chao Liu

use readfirstlane to force result into SGPR to reduce VGPR usage

parent 3b3cfae5

Showing 5 changed files with 447 additions and 464 deletions
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp   +12 -11
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp   +26 -19
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp   +38 -8
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp   +363 -426
composable_kernel/include/utility/config.amd.hpp.in   +8 -0
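Every change in this commit applies one trick: when a value is known to be identical across a wavefront, __builtin_amdgcn_readfirstlane broadcasts lane 0's copy of it, which lets the compiler keep it in a scalar register (SGPR) instead of one vector register (VGPR) per lane. A minimal, hypothetical HIP kernel (not part of this commit; the kernel and its names are invented for illustration) showing the intrinsic in isolation:

    #include <hip/hip_runtime.h>

    __global__ void add_one(const int* in, int* out, int len, int tile_size)
    {
        // len / tile_size is the same for every thread in the wavefront, but
        // the compiler cannot always prove that, so it may keep the quotient
        // in a per-lane VGPR. readfirstlane broadcasts lane 0's copy, forcing
        // the value into an SGPR and freeing one VGPR.
        const int num_tiles = __builtin_amdgcn_readfirstlane(len / tile_size);

        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if(idx < num_tiles * tile_size) // process only the full tiles
        {
            out[idx] = in[idx] + 1;
        }
    }

The intrinsic is only safe when the argument really is wavefront-uniform; if lanes disagree, every lane silently gets lane 0's value. That is the "thread invariant" caveat the commit's own comments repeat.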
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
@@ -423,17 +423,6 @@ struct DynamicMerge
                       LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
                       "wrong! inconsistent # of dimension");
-#if 0
-        // I only want to do this check, if idx_diff_up is know at compile-time
-        if(idx_diff_up[Number<0>{}] == 0)
-        {
-            static_for<0, NDimLow, 1>{}([&idx_diff_low](auto i){
-                idx_diff_low(i) = 0;
-            });
-
-            return;
-        }
-#endif
 
         // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
         // However,
         // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
@@ -449,7 +438,19 @@ struct DynamicMerge
         // computed at
         // run-time each time this function is called, and can be very expensive.
         LowerIndex idx_diff_low_const;
+#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
         CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
+#else
+        index_t tmp = idx_diff_up[Number<0>{}];
+
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+
+        // Hack: this forces the result into SGPR. Need to make sure the result is thread-invariant
+        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
+#endif
 
         // do carry check on each low dimension in reversed order
         // do not need to check the first dimension
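For reference, the #else branch above is ordinary mixed-radix digit extraction, with the last digit routed through readfirstlane; low_lengths_scan_[i] is the product of the lower lengths to the right of dimension i. A standalone host-side sketch (values are hypothetical, not from the commit) of the same divide/subtract scan:

    #include <array>
    #include <cstdio>

    int main()
    {
        constexpr int NDimLow = 3;
        // scan for lower lengths {4, 3, 5}: {3*5, 5, 1}
        const std::array<int, NDimLow> low_lengths_scan = {15, 5, 1};

        int tmp = 37; // the single "up" index diff being merged
        std::array<int, NDimLow> idx_diff_low{};

        // same loop structure as the diff, minus the intrinsic
        for(int i = 0; i < NDimLow - 1; ++i)
        {
            idx_diff_low[i] = tmp / low_lengths_scan[i];
            tmp -= idx_diff_low[i] * low_lengths_scan[i];
        }
        idx_diff_low[NDimLow - 1] = tmp;

        // prints (2, 1, 2), since 2*15 + 1*5 + 2 = 37
        std::printf("(%d, %d, %d)\n", idx_diff_low[0], idx_diff_low[1], idx_diff_low[2]);
        return 0;
    }

The loop only runs over the first NDimLow - 1 dimensions; whatever remains in tmp after the scan is exactly the last component, so the diff can apply readfirstlane to it directly.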
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp
@@ -121,9 +121,9 @@ struct BlockwiseDynamicTensorSliceTransfer_v1r1
     ThreadwiseTransfer threadwise_transfer_;
 };
 
-// this version is very likely to have scratch memory issue, due to:
+// this version tends to have scratch memory issue, due to:
 // 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
-// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
+// 2. ThreadwiseDynamicTensorSliceTransfer_v1r1::Run() constructs new tensor coordinate
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,
@@ -289,7 +289,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r1
 
 // this version does following things to avoid scratch memory issue
 // 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
-// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
+// 2. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct new tensor coordinate
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,
@@ -465,7 +465,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r2
 // this version does following things to avoid scratch memory issue
 // 1. BlockwiseDynamicTensorSliceTransfer_v2r3 doesn't allocate thread buffer (array) as member
 // 2. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
-// 3. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
+// 3. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct new tensor coordinate
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,
@@ -485,7 +485,9 @@ template <index_t BlockSize,
           AddressSpace DstAddressSpace,
           InMemoryDataOperation DstInMemOp,
           index_t SrcDataStride,
-          index_t DstDataStride>
+          index_t DstDataStride,
+          index_t ThreadTransferMoveBackSrcCoord = true,
+          index_t ThreadTransferMoveBackDstCoord = true>
 struct BlockwiseDynamicTensorSliceTransfer_v2r3
 {
     static constexpr index_t nDim =
@@ -607,20 +609,25 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r3
...
@@ -607,20 +609,25 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r3
AddressSpace
::
Vgpr
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
,
InMemoryDataOperation
::
Set
,
SrcDataStride
,
SrcDataStride
,
1
>
;
1
,
ThreadTransferMoveBackSrcCoord
,
using
ThreadwiseWrite
=
ThreadwiseDynamicTensorSliceTransfer_v1r2
<
decltype
(
thread_buffer_desc_
),
true
>
;
BlockDstDesc
,
ThreadSliceLengths
,
using
ThreadwiseWrite
=
DstDimAccessOrder
,
ThreadwiseDynamicTensorSliceTransfer_v1r2
<
decltype
(
thread_buffer_desc_
),
DstVectorWriteDim
,
BlockDstDesc
,
1
,
ThreadSliceLengths
,
DstDataPerWrite
,
DstDimAccessOrder
,
AddressSpace
::
Vgpr
,
DstVectorWriteDim
,
DstAddressSpace
,
1
,
DstInMemOp
,
DstDataPerWrite
,
1
,
AddressSpace
::
Vgpr
,
DstDataStride
>
;
DstAddressSpace
,
DstInMemOp
,
1
,
DstDataStride
,
true
,
ThreadTransferMoveBackDstCoord
>
;
ThreadwiseRead
threadwise_read_
;
ThreadwiseRead
threadwise_read_
;
ThreadwiseWrite
threadwise_write_
;
ThreadwiseWrite
threadwise_write_
;
...
...
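The comments in this file contrast the v1r1 and v1r2 designs several times, and the pattern is easier to see stripped down. A simplified sketch (hypothetical types, not the real CK interfaces) of why dropping the reference member matters: a reference member is a pointer the object has to carry, so the object can no longer live purely in registers and becomes a candidate for scratch spills, while a descriptor passed per call can stay a register value:

    #include <hip/hip_runtime.h>

    struct TensorDesc
    {
        int lengths[2];
        int strides[2];
    };

    // v1r1-style: the transfer object keeps a reference to the descriptor.
    // The reference is a pointer-sized member the compiler may spill.
    struct TransferKeepsRef
    {
        const TensorDesc& desc_;
        __device__ float Run(const float* src) const { return src[desc_.strides[0]]; }
    };

    // v1r2-style: the descriptor is a Run() parameter instead, so the
    // object itself stays empty and the descriptor is passed in registers.
    struct TransferNoRef
    {
        __device__ float Run(const TensorDesc& desc, const float* src) const
        {
            return src[desc.strides[0]];
        }
    };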
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp
@@ -459,11 +459,24 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
         const index_t N = b_k_n_global_desc.GetLength(I1);
 
         // divide block work by [M, N]
+#if 0
         const index_t m_block_work_num = M / MPerBlock;
         const index_t n_block_work_num = N / NPerBlock;
+#else
+        // Hack: this forces the result into SGPR
+        const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(M / MPerBlock);
+        const index_t n_block_work_num = __builtin_amdgcn_readfirstlane(N / NPerBlock);
+#endif
 
+#if 0
         const index_t m_block_work_id = get_block_1d_id() / n_block_work_num;
         const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
+#else
+        // Hack: this forces the result into SGPR
+        const index_t m_block_work_id =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / n_block_work_num);
+        const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
+#endif
 
         const index_t m_block_data_on_global = m_block_work_id * MPerBlock;
         const index_t n_block_data_on_global = n_block_work_id * NPerBlock;
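readfirstlane is legal in the hunk above because get_block_1d_id() is the same for every thread in the block, so everything derived from it is wavefront-uniform. A self-contained HIP sketch (hypothetical kernel, not the commit's code) of the same row-major decomposition of a 1-D block id into an [M, N] tile id:

    #include <hip/hip_runtime.h>

    __global__ void tile_origin(int* m_origin, int* n_origin, int N, int MPerBlock, int NPerBlock)
    {
        // blockIdx.x is uniform across the wavefront, so forcing these into
        // SGPRs cannot change the result, only the register class.
        const int n_block_work_num = __builtin_amdgcn_readfirstlane(N / NPerBlock);
        const int m_block_work_id  = __builtin_amdgcn_readfirstlane(blockIdx.x / n_block_work_num);
        const int n_block_work_id  = blockIdx.x - m_block_work_id * n_block_work_num;

        if(threadIdx.x == 0)
        {
            // first row and first column of the tile this block works on
            m_origin[blockIdx.x] = m_block_work_id * MPerBlock;
            n_origin[blockIdx.x] = n_block_work_id * NPerBlock;
        }
    }

Note that n_block_work_id gets no intrinsic in the diff: both of its inputs are already scalar at that point, so the multiply/subtract stays scalar on its own.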
@@ -505,10 +518,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
                                                AddressSpace::Lds,
                                                InMemoryDataOperation::Set,
                                                1,
-                                               1>(a_k_m_global_desc,
-                                                  make_multi_index(0, m_block_data_on_global),
-                                                  a_k_m_block_desc,
-                                                  make_multi_index(0, 0));
+                                               1,
+                                               true,
+                                               true>(a_k_m_global_desc,
+                                                     make_multi_index(0, m_block_data_on_global),
+                                                     a_k_m_block_desc,
+                                                     make_multi_index(0, 0));
 
         // B matrix blockwise copy
         auto b_block_copy =
@@ -531,10 +547,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
                                                AddressSpace::Lds,
                                                InMemoryDataOperation::Set,
                                                1,
-                                               1>(b_k_n_global_desc,
-                                                  make_multi_index(0, n_block_data_on_global),
-                                                  b_k_n_block_desc,
-                                                  make_multi_index(0, 0));
+                                               1,
+#if 0
+                                               true,
+#else
+                                               false,
+#endif
+                                               true>(b_k_n_global_desc,
+                                                     make_multi_index(0, n_block_data_on_global),
+                                                     b_k_n_block_desc,
+                                                     make_multi_index(0, 0));
 
         // GEMM definition
         // c_mtx += transpose(a_mtx) * b_mtx
@@ -599,7 +622,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
         threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);
 
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
+#if 0
         constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);
+#else
+        // HACK: fuse threadwise copy move-back coordinate with move src slice window
+        constexpr auto b_block_slice_copy_step =
+            b_block_copy.threadwise_read_.GetCoordinateStepBack() + make_multi_index(KPerBlock, 0);
+#endif
 
         // LDS double buffer: preload data into LDS
         {
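This last hunk pairs with the false passed to b_block_copy earlier: the B read coordinate is no longer stepped back after each copy; instead the step back is folded into the slice-window move, saving one coordinate update per iteration of the K loop. A tiny host-side sketch (hypothetical 2-D values) of why the fusion is equivalent:

    #include <cstdio>

    int main()
    {
        int coord[2]             = {3, 0};  // where the read coordinate lands after a copy
        const int step_back[2]   = {-3, 0}; // what GetCoordinateStepBack() would undo
        const int window_step[2] = {8, 0};  // move src slice window by KPerBlock = 8

        // one fused coordinate update instead of a move-back followed by a move
        for(int i = 0; i < 2; ++i)
            coord[i] += step_back[i] + window_step[i];

        std::printf("(%d, %d)\n", coord[0], coord[1]); // (8, 0)
        return 0;
    }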
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
(Diff collapsed in the original view: 363 additions, 426 deletions not shown.)
composable_kernel/include/utility/config.amd.hpp.in
@@ -74,6 +74,14 @@
 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
 #endif
 
+// hack: has an underlying assumption that needs to be satisfied, otherwise it's a bug
+// hack for forcing idx_diff_low_const to be kept in SGPR. idx_diff_low_const must be
+// thread-invariant, otherwise it's a bug
+// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
+#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
+#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
+#endif
+
 // workaround: put all workaround here
 // workaround for unnecessary VGPA <--> AGRP data movement when using mfma LLVM intrinsic
 #ifndef CK_WORKAROUND_SWDEV_229564
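Because of the #ifndef guard, the hack is off by default (the DynamicMerge code path above compiles to the plain CalculateLowerIndex call) and can be switched on without editing the header, for example from the compile line (hypothetical invocation):

    hipcc -DCK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE=1 ...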