Commit 7733dd88, authored Nov 28, 2020 by Chao Liu (composable_kernel)

    use readfirstlane to force result into SGPR to reduce VGPR usage

Parent: 3b3cfae5

Showing 5 changed files with 447 additions and 464 deletions:
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp            +12 -11
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp    +26 -19
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp                      +38 -8
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp   +363 -426
composable_kernel/include/utility/config.amd.hpp.in                                       +8 -0
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp

@@ -423,17 +423,6 @@ struct DynamicMerge
                           LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
                       "wrong! inconsistent # of dimension");

-#if 0
-        // I only want to do this check, if idx_diff_up is know at compile-time
-        if(idx_diff_up[Number<0>{}] == 0)
-        {
-            static_for<0, NDimLow, 1>{}([&idx_diff_low](auto i){
-                idx_diff_low(i) = 0;
-            });
-
-            return;
-        }
-#endif
-
         // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
         // However,
         // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const

@@ -449,7 +438,19 @@ struct DynamicMerge
         // computed at
         // run-time each time this function is called, and can be very expensive.
         LowerIndex idx_diff_low_const;

+#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
         CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
+#else
+        index_t tmp = idx_diff_up[Number<0>{}];
+
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+
+        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
+        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
+#endif

         // do carry check on each low dimension in reversed order
         // do not need to check the first dimension
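For context (not part of the commit): the split performed in the #else branch above is ordinary quotient/remainder arithmetic over the scan of the merged low lengths, and only the final remainder is routed through __builtin_amdgcn_readfirstlane. A standalone sketch of that arithmetic with made-up numbers, not repository code:

// Sketch only: mirrors the static_for loop above on the host, using hypothetical
// low_lengths_scan_ values {12, 4, 1} and idx_diff_up[0] == 27.
#include <array>
#include <cstdio>

int main()
{
    constexpr int NDimLow = 3;
    constexpr std::array<int, NDimLow> low_lengths_scan{12, 4, 1};

    int tmp = 27;                                  // stand-in for idx_diff_up[Number<0>{}]
    std::array<int, NDimLow> idx_diff_low_const{};

    for(int i = 0; i < NDimLow - 1; ++i)
    {
        idx_diff_low_const[i] = tmp / low_lengths_scan[i];   // quotient for this dimension
        tmp -= idx_diff_low_const[i] * low_lengths_scan[i];  // remainder carried forward
    }
    // the commit assigns this last remainder through __builtin_amdgcn_readfirstlane,
    // which is only valid because every lane computes the same value
    idx_diff_low_const[NDimLow - 1] = tmp;

    std::printf("%d %d %d\n", idx_diff_low_const[0], idx_diff_low_const[1], idx_diff_low_const[2]);
    // prints: 2 0 3   (27 = 2*12 + 0*4 + 3*1)
}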
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp

@@ -121,9 +121,9 @@ struct BlockwiseDynamicTensorSliceTransfer_v1r1
     ThreadwiseTransfer threadwise_transfer_;
 };

-// this version is very likely to have scratch memory issue, due to:
+// this version tend to have scratch memory issue, due to:
 // 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
-// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
+// 2. ThreadwiseDynamicTensorSliceTransfer_v1r1::Run() constructs new tensor coordinate
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,

@@ -289,7 +289,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r1
 // this version does following things to avoid scratch memory issue
 // 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
-// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
+// 2. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct new tensor coordinate
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,

@@ -465,7 +465,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r2
 // this version does following things to avoid scratch memory issue
 // 1. BlockwiseDynamicTensorSliceTransfer_v2r3 doesn't allocate thread buffer (array) as member
 // 2. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
-// 3. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
+// 3. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct new tensor coordinate
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,

@@ -485,7 +485,9 @@ template <index_t BlockSize,
           AddressSpace DstAddressSpace,
           InMemoryDataOperation DstInMemOp,
           index_t SrcDataStride,
-          index_t DstDataStride>
+          index_t DstDataStride,
+          index_t ThreadTransferMoveBackSrcCoord = true,
+          index_t ThreadTransferMoveBackDstCoord = true>
 struct BlockwiseDynamicTensorSliceTransfer_v2r3
 {
     static constexpr index_t nDim =

@@ -607,20 +609,25 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r3
                                                       AddressSpace::Vgpr,
                                                       InMemoryDataOperation::Set,
                                                       SrcDataStride,
-                                                      1>;
-
-    using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r2<decltype(thread_buffer_desc_),
-                                                                      BlockDstDesc,
-                                                                      ThreadSliceLengths,
-                                                                      DstDimAccessOrder,
-                                                                      DstVectorWriteDim,
-                                                                      1,
-                                                                      DstDataPerWrite,
-                                                                      AddressSpace::Vgpr,
-                                                                      DstAddressSpace,
-                                                                      DstInMemOp,
-                                                                      1,
-                                                                      DstDataStride>;
+                                                      1,
+                                                      ThreadTransferMoveBackSrcCoord,
+                                                      true>;
+
+    using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r2<decltype(thread_buffer_desc_),
+                                                                      BlockDstDesc,
+                                                                      ThreadSliceLengths,
+                                                                      DstDimAccessOrder,
+                                                                      DstVectorWriteDim,
+                                                                      1,
+                                                                      DstDataPerWrite,
+                                                                      AddressSpace::Vgpr,
+                                                                      DstAddressSpace,
+                                                                      DstInMemOp,
+                                                                      1,
+                                                                      DstDataStride,
+                                                                      true,
+                                                                      ThreadTransferMoveBackDstCoord>;

     ThreadwiseRead threadwise_read_;
     ThreadwiseWrite threadwise_write_;
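Background on the scratch-memory comments above (an illustration, not code from this repository): spills tend to appear when a transfer object holds a reference to the tensor descriptor and rebuilds its coordinate inside Run(), whereas keeping the coordinate as a member and passing the descriptor as an argument lets the compiler keep both in registers. A simplified sketch with hypothetical Desc/Coord stand-ins:

// Sketch only: minimal stand-in types illustrating the two patterns.
#include <hip/hip_runtime.h>

struct Desc  { int stride; };   // hypothetical tensor descriptor
struct Coord { int offset; };   // hypothetical tensor coordinate

struct TransferProneToScratch
{
    const Desc& desc_;          // reference member: descriptor may be forced to scratch memory
    Coord origin_;

    __device__ void Run(const float* src, float* dst) const
    {
        Coord coord = origin_;  // a fresh coordinate built on every call: another spill candidate
        dst[coord.offset] = src[coord.offset * desc_.stride];
    }
};

struct TransferAvoidingScratch
{
    Coord coord_;               // coordinate is a member and is mutated in place

    __device__ void Run(const Desc& desc, const float* src, float* dst)
    {
        // descriptor is passed in by argument, and no new coordinate is constructed here
        dst[coord_.offset] = src[coord_.offset * desc.stride];
    }
};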
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp

@@ -459,11 +459,24 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
         const index_t N = b_k_n_global_desc.GetLength(I1);

         // divide block work by [M, N]
+#if 0
         const index_t m_block_work_num = M / MPerBlock;
         const index_t n_block_work_num = N / NPerBlock;
+#else
+        // Hack: this force result into SGPR
+        const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(M / MPerBlock);
+        const index_t n_block_work_num = __builtin_amdgcn_readfirstlane(N / NPerBlock);
+#endif

+#if 0
         const index_t m_block_work_id = get_block_1d_id() / n_block_work_num;
         const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
+#else
+        // Hack: this force result into SGPR
+        const index_t m_block_work_id = __builtin_amdgcn_readfirstlane(get_block_1d_id() / n_block_work_num);
+
+        const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
+#endif

         const index_t m_block_data_on_global = m_block_work_id * MPerBlock;
         const index_t n_block_data_on_global = n_block_work_id * NPerBlock;

@@ -505,10 +518,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
                                                    AddressSpace::Lds,
                                                    InMemoryDataOperation::Set,
                                                    1,
-                                                   1>(a_k_m_global_desc,
-                                                      make_multi_index(0, m_block_data_on_global),
-                                                      a_k_m_block_desc,
-                                                      make_multi_index(0, 0));
+                                                   1,
+                                                   true,
+                                                   true>(a_k_m_global_desc,
+                                                         make_multi_index(0, m_block_data_on_global),
+                                                         a_k_m_block_desc,
+                                                         make_multi_index(0, 0));

         // B matrix blockwise copy
         auto b_block_copy =

@@ -531,10 +547,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
                                                    AddressSpace::Lds,
                                                    InMemoryDataOperation::Set,
                                                    1,
-                                                   1>(b_k_n_global_desc,
-                                                      make_multi_index(0, n_block_data_on_global),
-                                                      b_k_n_block_desc,
-                                                      make_multi_index(0, 0));
+                                                   1,
+#if 0
+                                                   true.
+#else
+                                                   false,
+#endif
+                                                   true>(b_k_n_global_desc,
+                                                         make_multi_index(0, n_block_data_on_global),
+                                                         b_k_n_block_desc,
+                                                         make_multi_index(0, 0));

         // GEMM definition
         // c_mtx += transpose(a_mtx) * b_mtx

@@ -599,7 +622,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
         threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);

         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
+#if 0
         constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);
+#else
+        // HACK: fuse threadwise copy move-back coordinate with move src slice window
+        constexpr auto b_block_slice_copy_step =
+            b_block_copy.threadwise_read_.GetCoordinateStepBack() + make_multi_index(KPerBlock, 0);
+#endif

         // LDS double buffer: preload data into LDS
         {
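A note on the fusion in the last hunk (background, not from the commit): GetCoordinateStepBack(), introduced in the threadwise transfer changes below, returns the multi-index that walks a coordinate from the end of the zig-zag traversal back to the slice origin. Adding it to the usual window step means one coordinate update per K iteration instead of two. A small worked example with hypothetical sizes:

// Sketch with made-up sizes, not kernel code. For a per-thread slice of lengths
// {L0, L1} traversed in back-and-forth (zig-zag) order, the traversal ends at
// dim0 index L0-1 and dim1 index 0 (if L0 is even) or L1-1 (if L0 is odd), so the
// step back to the origin is {1-L0, L0 % 2 == 0 ? 0 : 1-L1}.
#include <array>
#include <cstdio>

int main()
{
    constexpr int L0 = 2, L1 = 4;   // hypothetical ThreadSliceLengths
    constexpr int KPerBlock = 8;    // hypothetical K tile

    constexpr std::array<int, 2> step_back{1 - L0, (L0 % 2 == 0) ? 0 : 1 - L1}; // {-1, 0}

    // fused step = move-back + move-window, applied as a single coordinate move
    constexpr std::array<int, 2> fused{step_back[0] + KPerBlock, step_back[1]}; // {7, 0}

    std::printf("step_back = {%d, %d}, fused = {%d, %d}\n",
                step_back[0], step_back[1], fused[0], fused[1]);
}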
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp

@@ -7,11 +7,10 @@
 namespace ck {

-// threadwise_dynamic_tensor_slice_transfer_v1r1 has scratch memory issue, due to
-// it constructs new tensor coordinate
-template <typename SrcData,
-          typename DstData,
-          typename SrcDesc,
+// this version tends to have scratch memory issue, due to:
+// 1. It keeps reference to tensor descriptor
+// 2. It constructs new tensor coordinate in this->Run()
+template <typename SrcDesc,
           typename DstDesc,
           typename SliceLengths,
           typename SrcDstDimAccessOrder,
@@ -23,153 +22,59 @@ template <typename SrcData,
           InMemoryDataOperation DstInMemOp,
           index_t SrcScalarStrideInVector,
           index_t DstScalarStrideInVector>
-__host__ __device__ constexpr void
-threadwise_dynamic_tensor_slice_transfer_v1r1(const SrcDesc& src_desc,
-                                              const DynamicTensorCoordinate_t<SrcDesc>& src_origin_coord,
-                                              const SrcData* p_src,
-                                              const DstDesc& dst_desc,
-                                              const DynamicTensorCoordinate_t<DstDesc>& dst_origin_coord,
-                                              DstData* p_dst)
-{
-    // comment: construction tensor coordinate here seems to cause scratch memory issue
-    auto src_coord = src_origin_coord;
-    auto dst_coord = dst_origin_coord;
-
-    // TODO use constexpr for coordinate-step to make sure compiler behave correctly
-    const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
-    const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
-    const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
-    const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
-
-    const auto dst_step_0_p1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
-    const auto dst_step_0_m1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
-    const auto dst_step_p1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
-    const auto dst_step_m1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
-
-    constexpr index_t J0 = SliceLengths{}[0];
-    constexpr index_t J1 = SliceLengths{}[1];
-
-    bool forward_dim0 = true;
-    bool forward_dim1 = true;
-
-    // hardcoded for 2d loop for now
-#pragma unroll
-    for(index_t j0 = 0; j0 < J0; ++j0)
-    {
-#pragma unroll
-        for(index_t j1 = 0; j1 < J1; ++j1)
-        {
-            // do work
-            transfer_data<SrcData,
-                          1,
-                          SrcAddressSpace,
-                          DstAddressSpace,
-                          DstInMemOp,
-                          SrcScalarStrideInVector,
-                          DstScalarStrideInVector>(
-                p_src,
-                src_coord.GetOffset(),
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
-                src_desc.GetElementSpaceSize(),
-                p_dst,
-                dst_coord.GetOffset(),
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
-                dst_desc.GetElementSpaceSize());
-
-            // move dim1 iterator
-            if(j1 < J1 - 1)
-            {
-                if(forward_dim1)
-                {
-                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1);
-                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1);
-                }
-                else
-                {
-                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1);
-                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1);
-                }
-            }
-        }
-
-        // switch dim1 iteration direction
-        forward_dim1 = !forward_dim1;
-
-        // move dim0 iterator
-        if(j0 < J0 - 1)
-        {
-            if(forward_dim0)
-            {
-                move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0);
-                move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0);
-            }
-            else
-            {
-                move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0);
-                move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0);
-            }
-        }
-    }
-}
-
-// threadwise_dynamic_tensor_slice_transfer_v1r2 does not have scratch memory issue, due to
-// it does not construct new tensor coordinate
-template <typename SrcData,
-          typename DstData,
-          typename SrcDesc,
-          typename DstDesc,
-          typename SliceLengths,
-          typename SrcDstDimAccessOrder,
-          index_t SrcDstVectorDim,
-          index_t SrcScalarPerVector,
-          index_t DstScalarPerVector,
-          AddressSpace SrcAddressSpace,
-          AddressSpace DstAddressSpace,
-          InMemoryDataOperation DstInMemOp,
-          index_t SrcScalarStrideInVector,
-          index_t DstScalarStrideInVector>
-__host__ __device__ constexpr void
-threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
-                                              DynamicTensorCoordinate_t<SrcDesc>& src_coord,
-                                              const SrcData* p_src,
-                                              const DstDesc& dst_desc,
-                                              DynamicTensorCoordinate_t<DstDesc>& dst_coord,
-                                              DstData* p_dst)
-{
-    static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() ==
-                      remove_reference_t<DstDesc>::GetNumOfDimension(),
-                  "inconsistent # of dimension");
-
-    if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
-    {
-        // TODO use constexpr for coordinate-step to make sure compiler behave correctly
-        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
-        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
-        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
-        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
-
-        const auto dst_step_0_p1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
-        const auto dst_step_0_m1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
-        const auto dst_step_p1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
-        const auto dst_step_m1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
+struct ThreadwiseDynamicTensorSliceTransfer_v1r1
+{
+    static constexpr index_t nDim = SliceLengths::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
+    using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
+
+    using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
+    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
+
+    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1(const SrcDesc& src_desc,
+                                                                   const Index& src_slice_origin,
+                                                                   const DstDesc& dst_desc,
+                                                                   const Index& dst_slice_origin)
+        : src_desc_(src_desc),
+          src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
+          dst_desc_(dst_desc),
+          dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
+    {
+    }
+
+    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1()
+        : ThreadwiseDynamicTensorSliceTransfer_v1r1(
+              SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
+    {
+    }
+
+    template <typename SrcData, typename DstData>
+    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
+    {
+        // comment: construction tensor coordinate here tends to cause scratch memory issue
+        auto src_coord = src_slice_origin_;
+        auto dst_coord = dst_slice_origin_;
+
+        // TODO use constexpr for coordinate-step to make sure compiler behave correctly
+        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(src_desc_, make_multi_index(0, 1));
+        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(src_desc_, make_multi_index(0, -1));
+        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(src_desc_, make_multi_index(1, 0));
+        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(src_desc_, make_multi_index(-1, 0));
+
+        const auto dst_step_0_p1 = make_dynamic_tensor_coordinate_step(dst_desc_, make_multi_index(0, 1));
+        const auto dst_step_0_m1 = make_dynamic_tensor_coordinate_step(dst_desc_, make_multi_index(0, -1));
+        const auto dst_step_p1_0 = make_dynamic_tensor_coordinate_step(dst_desc_, make_multi_index(1, 0));
+        const auto dst_step_m1_0 = make_dynamic_tensor_coordinate_step(dst_desc_, make_multi_index(-1, 0));

         constexpr index_t Len0 = SliceLengths{}[0];
         constexpr index_t Len1 = SliceLengths{}[1];
@@ -177,11 +82,12 @@ threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
         bool forward_dim0 = true;
         bool forward_dim1 = true;

         // hardcoded for 2d loop for now
 #pragma unroll
-        for(index_t j0 = 0; j0 < Len0; ++j0)
+        for(index_t i0 = 0; i0 < Len0; ++i0)
         {
 #pragma unroll
-            for(index_t j1 = 0; j1 < Len1; ++j1)
+            for(index_t i1 = 0; i1 < Len1; ++i1)
             {
                 // do work
                 transfer_data<SrcData,
@@ -193,27 +99,27 @@ threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
                               DstScalarStrideInVector>(
                     p_src,
                     src_coord.GetOffset(),
-                    coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc,
+                    coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc_,
                                                                                 src_coord),
-                    src_desc.GetElementSpaceSize(),
+                    src_desc_.GetElementSpaceSize(),
                     p_dst,
                     dst_coord.GetOffset(),
-                    coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc,
+                    coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc_,
                                                                                 dst_coord),
-                    dst_desc.GetElementSpaceSize());
+                    dst_desc_.GetElementSpaceSize());

                 // move dim1 iterator
-                if(j1 < Len1 - 1)
+                if(i1 < Len1 - 1)
                 {
                     if(forward_dim1)
                     {
-                        move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1);
-                        move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1);
+                        move_dynamic_tensor_coordinate(src_desc_, src_coord, src_step_0_p1);
+                        move_dynamic_tensor_coordinate(dst_desc_, dst_coord, dst_step_0_p1);
                     }
                     else
                     {
-                        move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1);
-                        move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1);
+                        move_dynamic_tensor_coordinate(src_desc_, src_coord, src_step_0_m1);
+                        move_dynamic_tensor_coordinate(dst_desc_, dst_coord, dst_step_0_m1);
                     }
                 }
             }
@@ -222,274 +128,20 @@ threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
             forward_dim1 = !forward_dim1;

             // move dim0 iterator
-            if(j0 < Len0 - 1)
+            if(i0 < Len0 - 1)
             {
                 if(forward_dim0)
                 {
-                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0);
-                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0);
+                    move_dynamic_tensor_coordinate(src_desc_, src_coord, src_step_p1_0);
+                    move_dynamic_tensor_coordinate(dst_desc_, dst_coord, dst_step_p1_0);
                 }
                 else
                 {
-                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0);
-                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0);
+                    move_dynamic_tensor_coordinate(src_desc_, src_coord, src_step_m1_0);
+                    move_dynamic_tensor_coordinate(dst_desc_, dst_coord, dst_step_m1_0);
                 }
             }
         }
-
-        // move src and dst coordinate back to their origins
-        constexpr index_t loc0 = Len0 - 1;
-        constexpr index_t loc1 = Len0 % 2 == 0 ? 0 : Len1 - 1;
-
-        const auto src_step_back = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-loc0, -loc1));
-        const auto dst_step_back = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-loc0, -loc1));
-
-        move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
-        move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
-    }
-    else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
-    {
-        // TODO use constexpr for coordinate-step to make sure compiler behave correctly
-        const auto src_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
-        const auto src_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
-        const auto src_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
-        const auto src_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
-
-        const auto src_step_0_p1_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
-        const auto src_step_0_m1_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
-        const auto src_step_p1_0_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
-        const auto src_step_m1_0_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
-
-        const auto dst_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
-        const auto dst_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));
-        const auto dst_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
-        const auto dst_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));
-
-        const auto dst_step_0_p1_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
-        const auto dst_step_0_m1_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
-        const auto dst_step_p1_0_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
-        const auto dst_step_m1_0_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
-
-        constexpr index_t Len0 = SliceLengths{}[0];
-        constexpr index_t Len1 = SliceLengths{}[1];
-        constexpr index_t Len2 = SliceLengths{}[2];
-        constexpr index_t Len3 = SliceLengths{}[3];
-
-        bool forward_dim0 = true;
-        bool forward_dim1 = true;
-        bool forward_dim2 = true;
-        bool forward_dim3 = true;
-
-#pragma unroll
-        for(index_t j0 = 0; j0 < Len0; ++j0)
-        {
-#pragma unroll
-            for(index_t j1 = 0; j1 < Len1; ++j1)
-            {
-#pragma unroll
-                for(index_t j2 = 0; j2 < Len2; ++j2)
-                {
-#pragma unroll
-                    for(index_t j3 = 0; j3 < Len3; ++j3)
-                    {
-                        // do work
-                        transfer_data<SrcData,
-                                      1,
-                                      SrcAddressSpace,
-                                      DstAddressSpace,
-                                      DstInMemOp,
-                                      SrcScalarStrideInVector,
-                                      DstScalarStrideInVector>(
-                            p_src,
-                            src_coord.GetOffset(),
-                            coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
-                            src_desc.GetElementSpaceSize(),
-                            p_dst,
-                            dst_coord.GetOffset(),
-                            coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
-                            dst_desc.GetElementSpaceSize());
-
-                        // move dim1 iterator
-                        if(j3 < Len3 - 1)
-                        {
-                            if(forward_dim3)
-                            {
-                                move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_0_0_p1);
-                                move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_0_0_p1);
-                            }
-                            else
-                            {
-                                move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_0_0_m1);
-                                move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_0_0_m1);
-                            }
-                        }
-                    }
-
-                    // switch dim3 iteration direction
-                    forward_dim3 = !forward_dim3;
-
-                    // move dim1 iterator
-                    if(j2 < Len2 - 1)
-                    {
-                        if(forward_dim2)
-                        {
-                            move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_0_p1_0);
-                            move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_0_p1_0);
-                        }
-                        else
-                        {
-                            move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_0_m1_0);
-                            move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_0_m1_0);
-                        }
-                    }
-                }
-
-                // switch dim2 iteration direction
-                forward_dim2 = !forward_dim2;
-
-                // move dim1 iterator
-                if(j1 < Len1 - 1)
-                {
-                    if(forward_dim1)
-                    {
-                        move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1_0_0);
-                        move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1_0_0);
-                    }
-                    else
-                    {
-                        move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1_0_0);
-                        move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1_0_0);
-                    }
-                }
-            }
-
-            // switch dim1 iteration direction
-            forward_dim1 = !forward_dim1;
-
-            // move dim0 iterator
-            if(j0 < Len0 - 1)
-            {
-                if(forward_dim0)
-                {
-                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0_0_0);
-                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0_0_0);
-                }
-                else
-                {
-                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0_0_0);
-                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0_0_0);
-                }
-            }
-        }
-
-        // move src and dst coordinate back to their origins
-        constexpr index_t loc0 = Len0 - 1;
-        constexpr index_t loc1 = Len0 % 2 == 0 ? 0 : Len1 - 1;
-        constexpr index_t loc2 = Len1 % 2 == 0 ? 0 : Len2 - 1;
-        constexpr index_t loc3 = Len2 % 2 == 0 ? 0 : Len3 - 1;
-
-        const auto src_step_back = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-loc0, -loc1, -loc2, -loc3));
-        const auto dst_step_back = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-loc0, -loc1, -loc2, -loc3));
-
-        move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
-        move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
-    }
-}
-
-// this version has scratch memory issue, due to:
-// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
-// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
-template <typename SrcDesc,
-          typename DstDesc,
-          typename SliceLengths,
-          typename SrcDstDimAccessOrder,
-          index_t SrcDstVectorDim,
-          index_t SrcScalarPerVector,
-          index_t DstScalarPerVector,
-          AddressSpace SrcAddressSpace,
-          AddressSpace DstAddressSpace,
-          InMemoryDataOperation DstInMemOp,
-          index_t SrcScalarStrideInVector,
-          index_t DstScalarStrideInVector>
-struct ThreadwiseDynamicTensorSliceTransfer_v1r1
-{
-    static constexpr index_t nDim = SliceLengths::Size();
-
-    using Index = MultiIndex<nDim>;
-
-    using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
-    using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
-
-    using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
-    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
-
-    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1(const SrcDesc& src_desc,
-                                                                   const Index& src_slice_origin,
-                                                                   const DstDesc& dst_desc,
-                                                                   const Index& dst_slice_origin)
-        : src_desc_(src_desc),
-          src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
-          dst_desc_(dst_desc),
-          dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
-    {
-    }
-
-    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1()
-        : ThreadwiseDynamicTensorSliceTransfer_v1r1(
-              SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
-    {
-    }
-
-    template <typename SrcData, typename DstData>
-    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
-    {
-        threadwise_dynamic_tensor_slice_transfer_v1r1<SrcData,
-                                                      DstData,
-                                                      SrcDesc,
-                                                      DstDesc,
-                                                      SliceLengths,
-                                                      SrcDstDimAccessOrder,
-                                                      SrcDstVectorDim,
-                                                      SrcScalarPerVector,
-                                                      DstScalarPerVector,
-                                                      SrcAddressSpace,
-                                                      DstAddressSpace,
-                                                      DstInMemOp,
-                                                      SrcScalarStrideInVector,
-                                                      DstScalarStrideInVector>(
-            src_desc_, src_slice_origin_, p_src, dst_desc_, dst_slice_origin_, p_dst);
-    }
+    }

     __device__ void SetSrcSliceOrigin(const Index& src_slice_origin_idx)
@@ -505,6 +157,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r1
     // src_slice_origin_step_idx need to be known at compile-time, for performance reason
     __device__ void MoveSrcSliceWindow(const Index& src_slice_origin_step_idx)
     {
+        // is it OK to construct a new step every time?
         const auto src_slice_origin_step =
             make_dynamic_tensor_coordinate_step(src_desc_, src_slice_origin_step_idx);
@@ -514,6 +167,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r1
    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveDstSliceWindow(const Index& dst_slice_origin_step_idx)
    {
+        // is it OK to construct a new step every time?
        const auto dst_slice_origin_step =
            make_dynamic_tensor_coordinate_step(dst_desc_, dst_slice_origin_step_idx);
@@ -528,9 +182,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r1
     DstCoord dst_slice_origin_;
 };

-// this version does not have scratch memory issue, due to:
-// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
-// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
+// this version is less likely to have scratch memory issue, due to:
+// 1. It does not keep reference to tensor descriptor
+// 2. It does not construct new tensor coordinate for this->Run()
 template <typename SrcDesc,
           typename DstDesc,
           typename SliceLengths,
@@ -542,7 +196,9 @@ template <typename SrcDesc,
           AddressSpace DstAddressSpace,
           InMemoryDataOperation DstInMemOp,
           index_t SrcScalarStrideInVector,
-          index_t DstScalarStrideInVector>
+          index_t DstScalarStrideInVector,
+          bool MoveBackSrcCoord = true,
+          bool MoveBackDstCoord = true>
 struct ThreadwiseDynamicTensorSliceTransfer_v1r2
 {
     static constexpr index_t nDim = SliceLengths::Size();
@@ -573,21 +229,302 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
     __device__ void Run(const SrcDesc& src_desc,
                        const SrcData* p_src,
                        const DstDesc& dst_desc,
                        DstData* p_dst)
    {
-        threadwise_dynamic_tensor_slice_transfer_v1r2<SrcData,
-                                                      DstData,
-                                                      SrcDesc,
-                                                      DstDesc,
-                                                      SliceLengths,
-                                                      SrcDstDimAccessOrder,
-                                                      SrcDstVectorDim,
-                                                      SrcScalarPerVector,
-                                                      DstScalarPerVector,
-                                                      SrcAddressSpace,
-                                                      DstAddressSpace,
-                                                      DstInMemOp,
-                                                      SrcScalarStrideInVector,
-                                                      DstScalarStrideInVector>(
-            src_desc, src_slice_origin_, p_src, dst_desc, dst_slice_origin_, p_dst);
+        if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
+        {
+            // TODO use constexpr for coordinate-step to make sure compiler behave correctly
+            const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
+            const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
+            const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
+            const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
+
+            const auto dst_step_0_p1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
+            const auto dst_step_0_m1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
+            const auto dst_step_p1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
+            const auto dst_step_m1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
+
+            constexpr index_t Len0 = SliceLengths{}[0];
+            constexpr index_t Len1 = SliceLengths{}[1];
+
+            bool forward_dim0 = true;
+            bool forward_dim1 = true;
+
+#pragma unroll
+            for(index_t i0 = 0; i0 < Len0; ++i0)
+            {
+#pragma unroll
+                for(index_t i1 = 0; i1 < Len1; ++i1)
+                {
+                    // do work
+                    transfer_data<SrcData,
+                                  1,
+                                  SrcAddressSpace,
+                                  DstAddressSpace,
+                                  DstInMemOp,
+                                  SrcScalarStrideInVector,
+                                  DstScalarStrideInVector>(
+                        p_src,
+                        src_slice_origin_.GetOffset(),
+                        coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_slice_origin_),
+                        src_desc.GetElementSpaceSize(),
+                        p_dst,
+                        dst_slice_origin_.GetOffset(),
+                        coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_slice_origin_),
+                        dst_desc.GetElementSpaceSize());
+
+                    // move dim1 iterator
+                    if(i1 < Len1 - 1)
+                    {
+                        if(forward_dim1)
+                        {
+                            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
+                            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_p1);
+                        }
+                        else
+                        {
+                            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
+                            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_m1);
+                        }
+                    }
+                }
+
+                // switch dim1 iteration direction
+                forward_dim1 = !forward_dim1;
+
+                // move dim0 iterator
+                if(i0 < Len0 - 1)
+                {
+                    if(forward_dim0)
+                    {
+                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
+                        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
+                    }
+                    else
+                    {
+                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_m1_0);
+                        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_m1_0);
+                    }
+                }
+            }
+        }
+        else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
+        {
+            // TODO use constexpr for coordinate-step to make sure compiler behave correctly
+            const auto src_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
+            const auto src_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
+            const auto src_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
+            const auto src_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
+
+            const auto src_step_0_p1_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
+            const auto src_step_0_m1_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
+            const auto src_step_p1_0_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
+            const auto src_step_m1_0_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
+
+            const auto dst_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
+            const auto dst_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));
+            const auto dst_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
+            const auto dst_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));
+
+            const auto dst_step_0_p1_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
+            const auto dst_step_0_m1_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
+            const auto dst_step_p1_0_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
+            const auto dst_step_m1_0_0_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
+
+            constexpr index_t Len0 = SliceLengths{}[0];
+            constexpr index_t Len1 = SliceLengths{}[1];
+            constexpr index_t Len2 = SliceLengths{}[2];
+            constexpr index_t Len3 = SliceLengths{}[3];
+
+            bool forward_dim0 = true;
+            bool forward_dim1 = true;
+            bool forward_dim2 = true;
+            bool forward_dim3 = true;
+
+#pragma unroll
+            for(index_t i0 = 0; i0 < Len0; ++i0)
+            {
+#pragma unroll
+                for(index_t i1 = 0; i1 < Len1; ++i1)
+                {
+#pragma unroll
+                    for(index_t i2 = 0; i2 < Len2; ++i2)
+                    {
+#pragma unroll
+                        for(index_t i3 = 0; i3 < Len3; ++i3)
+                        {
+                            // do work
+                            transfer_data<SrcData,
+                                          1,
+                                          SrcAddressSpace,
+                                          DstAddressSpace,
+                                          DstInMemOp,
+                                          SrcScalarStrideInVector,
+                                          DstScalarStrideInVector>(
+                                p_src,
+                                src_slice_origin_.GetOffset(),
+                                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_slice_origin_),
+                                src_desc.GetElementSpaceSize(),
+                                p_dst,
+                                dst_slice_origin_.GetOffset(),
+                                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_slice_origin_),
+                                dst_desc.GetElementSpaceSize());
+
+                            // move dim1 iterator
+                            if(i3 < Len3 - 1)
+                            {
+                                if(forward_dim3)
+                                {
+                                    move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_0_0_p1);
+                                    move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_0_0_p1);
+                                }
+                                else
+                                {
+                                    move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_0_0_m1);
+                                    move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_0_0_m1);
+                                }
+                            }
+                        }
+
+                        // switch dim3 iteration direction
+                        forward_dim3 = !forward_dim3;
+
+                        // move dim1 iterator
+                        if(i2 < Len2 - 1)
+                        {
+                            if(forward_dim2)
+                            {
+                                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_0_p1_0);
+                                move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_0_p1_0);
+                            }
+                            else
+                            {
+                                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_0_m1_0);
+                                move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_0_m1_0);
+                            }
+                        }
+                    }
+
+                    // switch dim2 iteration direction
+                    forward_dim2 = !forward_dim2;
+
+                    // move dim1 iterator
+                    if(i1 < Len1 - 1)
+                    {
+                        if(forward_dim1)
+                        {
+                            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1_0_0);
+                            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_p1_0_0);
+                        }
+                        else
+                        {
+                            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1_0_0);
+                            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_m1_0_0);
+                        }
+                    }
+                }
+
+                // switch dim1 iteration direction
+                forward_dim1 = !forward_dim1;
+
+                // move dim0 iterator
+                if(i0 < Len0 - 1)
+                {
+                    if(forward_dim0)
+                    {
+                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0_0_0);
+                        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0_0_0);
+                    }
+                    else
+                    {
+                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_m1_0_0_0);
+                        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_m1_0_0_0);
+                    }
+                }
+            }
+        }
+
+        // move src and dst coordinate back to their origins
+        if constexpr(MoveBackSrcCoord)
+        {
+            const auto src_step_back = make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateStepBack());
+
+            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_back);
+        }
+
+        if constexpr(MoveBackDstCoord)
+        {
+            const auto dst_step_back = make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateStepBack());
+
+            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_back);
+        }
     }

+    __device__ static constexpr auto GetCoordinateStepBack()
+    {
+        MultiIndex<nDim> step_back;
+
+        step_back(Number<0>{}) = 1 - SliceLengths{}[0];
+
+        static_for<1, nDim, 1>{}([&](auto i) {
+            step_back(i) = (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
+        });
+
+        return step_back;
+    }
+
     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -604,7 +541,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
     __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                        const Index& src_slice_origin_step_idx)
     {
-        // is it OK to do this every time?
+        // is it OK to construct a new step every time?
         const auto src_slice_origin_step =
             make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);
@@ -615,7 +552,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
     __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                        const Index& dst_slice_origin_step_idx)
     {
-        // is it OK to do this every time?
+        // is it OK to construct a new step every time?
         const auto dst_slice_origin_step =
             make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);
composable_kernel/include/utility/config.amd.hpp.in

@@ -74,6 +74,14 @@
 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
 #endif

+// hack: have underlying assumption that need to be satsified, otherwise it's a bug
+// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
+// thread-invariant, otherwise it's a bug
+// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
+#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
+#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
+#endif
+
 // workaround: put all workaround here
 // workaround for unnecessary VGPA <--> AGRP data movement when using mfma LLVM intrinsic
 #ifndef CK_WORKAROUND_SWDEV_229564
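The hack this macro gates relies on __builtin_amdgcn_readfirstlane, which broadcasts lane 0's value so the compiler may keep the result in a scalar register; it is only valid when every lane computes the same value. A minimal HIP sketch of the general pattern, with hypothetical kernel and variable names (assuming a ROCm/HIP toolchain; not code from this commit):

// Sketch only: illustrates the SGPR-forcing pattern, not repository code.
#include <hip/hip_runtime.h>

__global__ void scale_rows(float* p, int len, float alpha)
{
    // `len / 4` is thread-invariant, but the compiler may still materialize it in a VGPR.
    // readfirstlane broadcasts lane 0's value, allowing the result to live in an SGPR and
    // freeing a vector register. If `len` differed per lane, this would be a bug.
    const int quarter = __builtin_amdgcn_readfirstlane(len / 4);

    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid < quarter)
    {
        p[tid] *= alpha;
    }
}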