gaoqiong / composable_kernel / Commits / 68ea43b1

Commit 68ea43b1, authored Jan 27, 2021 by Chao Liu

    add vector load and access order to threadwise copy v1r3

parent 5fdccfce

Showing 9 changed files with 780 additions and 346 deletions (+780 -346)
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp   +1   -1
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp                              +159 -7
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp                                        +29  -0
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp                     +433 -330
composable_kernel/include/utility/amd_buffer_addressing.hpp                                                 +134 -2
composable_kernel/include/utility/config.amd.hpp.in                                                         +3   -1
composable_kernel/include/utility/functional2.hpp                                                           +2   -1
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp                     +3   -3
driver/src/conv_driver.cpp                                                                                  +16  -1
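The headline change is that the threadwise copy v1r3 now gathers DstScalarPerVector elements from the thread's register buffer and writes them with a single wide buffer store, visiting the slice in a caller-chosen dimension order. The standalone host-side sketch below is not CK code; the element type, buffer sizes, and helper name are illustrative assumptions used only to show the idea of grouping scalars into one vector store.

#include <array>
#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for a wide vector store (the kernel uses a buffer-store intrinsic instead).
template <std::size_t N>
void store_vector(float* dst, const std::array<float, N>& v)
{
    for(std::size_t i = 0; i < N; ++i)
        dst[i] = v[i];
}

int main()
{
    constexpr std::size_t scalar_per_vector = 4;
    std::array<float, 16> thread_buffer{}; // per-thread source data (register-file analogue)
    std::array<float, 16> global_buffer{}; // destination (global-memory analogue)
    for(std::size_t i = 0; i < thread_buffer.size(); ++i)
        thread_buffer[i] = static_cast<float>(i);

    // Gather scalar_per_vector contiguous elements, then issue one wide store per group,
    // instead of scalar_per_vector separate scalar stores.
    for(std::size_t base = 0; base < thread_buffer.size(); base += scalar_per_vector)
    {
        std::array<float, scalar_per_vector> v{};
        for(std::size_t i = 0; i < scalar_per_vector; ++i)
            v[i] = thread_buffer[base + i];
        store_vector(&global_buffer[base], v);
    }

    std::printf("%f %f\n", global_buffer[0], global_buffer[15]);
    return 0;
}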
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...
@@ -492,7 +492,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
            0,
            GemmABlockTransferSrcScalarPerVector_GemmK,
            GemmABlockTransferDstScalarPerVector_GemmM,
            true,  // move back src coordinate after threadwise copy
            false, // don't move back src coordinate after threadwise copy
            GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
            GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
            Sequence<0, 1>,
...
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...
@@ -502,11 +502,11 @@ struct DynamicMerge
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ void UpdateLowerIndex_1(LowIdxDiff& idx_diff_low,
                                                const UpIdxDiff& idx_diff_up,
                                                LowIdx& idx_low,
                                                const UpIdx& /* idx_up_new */,
                                                Number<Hack>) const
    __host__ __device__ void UpdateLowerIndex_1a(LowIdxDiff& idx_diff_low,
                                                 const UpIdxDiff& idx_diff_up,
                                                 LowIdx& idx_low,
                                                 const UpIdx& /* idx_up_new */,
                                                 Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
...
@@ -640,6 +640,148 @@ struct DynamicMerge
        }
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ void UpdateLowerIndex_1b(LowIdxDiff& idx_diff_low,
                                                 const UpIdxDiff& idx_diff_up,
                                                 LowIdx& idx_low,
                                                 const UpIdx& /* idx_up_new */,
                                                 Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");
        // CalculateLowerIndex(idx_diff_low_const) requires multiple integer divisions.
        // However:
        // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const can be
        //    calculated at compile-time.
        // 2) If idx_diff_up is not known at compile-time, but its value does not change
        //    during kernel execution, then idx_diff_low_const also does not change during
        //    kernel execution, and the compiler-generated ISA should calculate
        //    idx_diff_low_const only once and keep it for the whole kernel execution.
        // If neither 1) nor 2) is satisfied, the calculation is done at run-time every
        // time this function is called, and can be very expensive.
        LowerIndex idx_diff_low_const;
        LowerIndex idx_low_length_minus_idx_diff_low_const;
        LowerIndex idx_low_length_plus_idx_diff_low_const;

#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
        index_t tmp = idx_diff_up[Number<0>{}];

        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
        });

        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;

        static_for<0, NDimLow, 1>{}([&](auto i) {
            idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
            idx_low_length_plus_idx_diff_low_const(i)  = low_lengths_[i] + idx_diff_low_const[i];
        });
#else
        // Hack: this forces the result into SGPR. Need to make sure the result is thread-invariant.
        index_t tmp = idx_diff_up[Number<0>{}];

        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
        });

        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);

        static_for<0, NDimLow, 1>{}([&](auto i) {
            idx_low_length_minus_idx_diff_low_const(i) =
                __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);

            idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
        });
#endif

        if constexpr(Hack == 1)
        {
            // do carry check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low[i] + carry;

                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];

                idx_diff_low(i) += carry;

                carry = do_carry ? 1 : 0;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;

            idx_low += idx_diff_low;
        }
        else if constexpr(Hack == 2)
        {
            // do borrow check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t borrow = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t negative_idx_low_tmp = borrow - idx_low[i];

                bool do_borrow = negative_idx_low_tmp > idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];

                idx_diff_low(i) -= borrow;

                borrow = do_borrow ? 1 : 0;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;

            idx_low += idx_diff_low;
        }
        else
        {
            // do carry/borrow check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                index_t idx_low_tmp = idx_low[i] + carry;

                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];

                idx_diff_low(i) =
                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
                idx_diff_low(i) =
                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];

                idx_diff_low(i) += carry;

                carry = do_carry ? 1 : 0;
                carry = do_borrow ? -1 : carry;
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;

            idx_low += idx_diff_low;
        }
    }

    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
...
@@ -705,11 +847,15 @@ struct DynamicMerge
                do_carry = idx_low_tmp >= low_lengths_[i];

#if 0
                // TODO: use exec-mask inline asm
                if(do_carry)
                {
                    idx_diff_low(i) -= low_lengths_[i];
                }
#else
                idx_diff_low(i) = do_carry ? idx_diff_low[i] - low_lengths_[i] : idx_diff_low[i];
#endif

                idx_low(i) += idx_diff_low[i];
            });
...
@@ -733,11 +879,15 @@ struct DynamicMerge
                do_borrow = idx_low_tmp < 0;

#if 0
                // TODO: use exec-mask inline asm
                if(do_borrow)
                {
                    idx_diff_low(i) += low_lengths_[i];
                }
#else
                idx_diff_low(i) = do_borrow ? idx_diff_low[i] + low_lengths_[i] : idx_diff_low[i];
#endif

                idx_low(i) += idx_diff_low[i];
            });
...
@@ -765,8 +915,10 @@ struct DynamicMerge
                                              const UpIdx& idx_up_new,
                                              Number<Hack>) const
    {
#if 1
        UpdateLowerIndex_1(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#if 0
        UpdateLowerIndex_1a(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#elif 0
        UpdateLowerIndex_1b(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#else
        UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#endif
...
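For reference, the carry logic implemented by the UpdateLowerIndex_* variants above can be illustrated with plain host integers. This is a simplified standalone sketch (the array sizes and values are illustrative, not taken from the kernel): a 1-D step in the merged upper index is first decomposed into a constant per-dimension diff, and a carry is then propagated from the fastest to the slowest dimension, mirroring the Hack == 1 branch.

#include <array>
#include <cstdio>

int main()
{
    // Lower-dimension lengths of a merged index, fastest dimension last (illustrative values).
    constexpr std::array<int, 3> low_lengths = {4, 3, 5};
    // Scan of lengths: stride of each dimension in the merged (1-D) index space.
    constexpr std::array<int, 3> low_scan = {3 * 5, 5, 1};

    std::array<int, 3> idx_low = {1, 2, 4}; // current lower index
    const int idx_diff_up = 7;              // step in the merged upper index

    // Decompose the 1-D step into a constant per-dimension diff (integer divisions).
    std::array<int, 3> diff_const{};
    int tmp = idx_diff_up;
    for(int i = 0; i < 2; ++i)
    {
        diff_const[i] = tmp / low_scan[i];
        tmp -= diff_const[i] * low_scan[i];
    }
    diff_const[2] = tmp;

    // Propagate carries from the fastest to the slowest dimension.
    std::array<int, 3> diff{};
    int carry = 0;
    for(int i = 2; i > 0; --i)
    {
        const int t        = idx_low[i] + carry;
        const bool do_carry = t >= low_lengths[i] - diff_const[i];
        diff[i] = (do_carry ? -(low_lengths[i] - diff_const[i]) : diff_const[i]) + carry;
        carry   = do_carry ? 1 : 0;
    }
    diff[0] = diff_const[0] + carry;

    for(int i = 0; i < 3; ++i)
        idx_low[i] += diff[i];

    std::printf("new idx_low = {%d, %d, %d}\n", idx_low[0], idx_low[1], idx_low[2]); // {2, 1, 1}
    return 0;
}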
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp
...
@@ -4,6 +4,7 @@
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"
#include "ConstantMatrixDescriptor.hpp"
...
@@ -364,9 +365,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
        // define input tensor descriptor for threadwise copy
        //     thread input tensor, src of threadwise copy
#if 0 // debug
        constexpr auto c_m0_m1_n0_n1_thread_desc =
            make_dynamic_naive_tensor_descriptor_packed<4>(
                make_multi_index(MRepeat, MPerThread, NRepeat, NPerThread));
#else
        constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed(
            Sequence<MRepeat, MPerThread, NRepeat, NPerThread>{});
#endif

        // calculate origin of thread input tensor on global memory
        //     blockwise GEMM c matrix starting index
...
@@ -379,6 +385,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
        const index_t n_thread_data_on_global =
            n_block_data_on_global + c_thread_mtx_on_block.col;

#if 0
        ThreadwiseDynamicTensorSliceTransfer_v1r2<
            AccFloat,
            Float,
...
@@ -404,6 +411,28 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                             n_thread_data_on_global % N1))
            .Run_hack(
                c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#else
        ThreadwiseDynamicTensorSliceTransfer_v1r3<
            AccFloat,
            Float,
            decltype(c_m0_m1_n0_n1_thread_desc),
            decltype(c_m0_m1_n0_n1_global_desc),
            Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
            CThreadTransferSrcDstAccessOrder,
            CThreadTransferSrcDstVectorDim,
            CThreadTransferDstScalarPerVector,
            AddressSpace::Vgpr,
            AddressSpace::Global,
            CGlobalMemoryDataOperation,
            1,
            true,
            true>(c_m0_m1_n0_n1_global_desc,
                  make_multi_index(m_thread_data_on_global / M1,
                                   m_thread_data_on_global % M1,
                                   n_thread_data_on_global / N1,
                                   n_thread_data_on_global % N1))
            .Run_hack(p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#endif
        }
    }
...
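The v1r3 transfer instantiated above derives its loop structure from three small compile-time arrays: the number of scalars moved per access in each dimension, the step taken inside a vector, and the number of accesses per dimension. A standalone host-side sketch of that arithmetic (dimension count, slice lengths, and the chosen vector dimension are illustrative assumptions, not the kernel's actual values):

#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 4;
    constexpr std::array<int, nDim> slice_lengths = {2, 4, 2, 4}; // e.g. MRepeat, MPerThread, NRepeat, NPerThread
    constexpr int dst_vector_dim        = 3;                      // dimension written with wide stores
    constexpr int dst_scalar_per_vector = 4;

    std::array<int, nDim> dst_scalar_per_access{};
    std::array<int, nDim> dst_scalar_step_in_vector{};
    std::array<int, nDim> access_lengths{};

    for(int i = 0; i < nDim; ++i)
    {
        dst_scalar_per_access[i]     = (i == dst_vector_dim) ? dst_scalar_per_vector : 1;
        dst_scalar_step_in_vector[i] = (i == dst_vector_dim) ? 1 : 0;
        access_lengths[i]            = slice_lengths[i] / dst_scalar_per_access[i];
    }

    // With these numbers the 4-level loop issues 2*4*2*1 = 16 vector stores of 4 elements
    // instead of 64 scalar stores.
    for(int i = 0; i < nDim; ++i)
        std::printf("dim %d: per_access=%d step_in_vector=%d accesses=%d\n",
                    i, dst_scalar_per_access[i], dst_scalar_step_in_vector[i], access_lengths[i]);
    return 0;
}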
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
...
@@ -67,7 +67,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
    }

    __device__ void Run(const SrcDesc& src_desc,
                        const SrcData* p_src,
                        const DstDesc& dst_desc,
                        DstData* p_dst)
    __device__ void Run_hack(const SrcDesc& src_desc,
                             const SrcData* p_src,
                             const DstDesc& dst_desc,
                             DstData* p_dst)
    {
        if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
        {
...
@@ -152,7 +152,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
        }
        else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
        {
            // TODO use constexpr for coordinate-step to make sure compiler behave correctly
            // TODO use constexpr for coordinate-step to make sure compiler behave correctly
#if 0
            const auto src_step_0_0_0_p1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
            const auto src_step_0_0_0_m1 =
...
@@ -192,6 +193,48 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
            const auto dst_step_m1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#else
            // hack for output tensor
            const auto src_step_0_0_0_p1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
            const auto src_step_0_0_0_m1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
            const auto src_step_0_0_p1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
            const auto src_step_0_0_m1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
            const auto src_step_0_p1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
            const auto src_step_0_m1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
            const auto src_step_p1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
            const auto src_step_m1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));

            const auto dst_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, 0, 1), Sequence<0, 0, 1, 0, 0>{});
            const auto dst_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, 0, -1), Sequence<0, 0, 2, 0, 0>{});
            const auto dst_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, 1, 0), Sequence<0, 0, 1, 0, 0>{});
            const auto dst_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, -1, 0), Sequence<0, 0, 2, 0, 0>{});
            const auto dst_step_0_p1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
            const auto dst_step_0_m1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
            const auto dst_step_p1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
            const auto dst_step_m1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#endif

            constexpr index_t Len0 = SliceLengths{}[0];
            constexpr index_t Len1 = SliceLengths{}[1];
...
@@ -336,331 +379,398 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
        return back_step;
    }

    __device__ void Run_hack(const SrcDesc& src_desc,
                             const SrcData* p_src,
                             const DstDesc& dst_desc,
                             DstData* p_dst)

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
    {
        if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
        {
            // TODO use constexpr for coordinate-step to make sure compiler behave correctly
            const auto src_step_0_p1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
            const auto src_step_0_m1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
            const auto src_step_p1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
            const auto src_step_m1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));

        // is it OK to construct a new step every time?
        const auto src_slice_origin_step =
            make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);

            const auto dst_step_0_p1 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
            const auto dst_step_0_m1 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));

        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_slice_origin_step);
    }

            const auto dst_step_p1_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
            const auto dst_step_m1_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));

    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // is it OK to construct a new step every time?
        const auto dst_slice_origin_step =
            make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);

            constexpr index_t Len0 = SliceLengths{}[0];
            constexpr index_t Len1 = SliceLengths{}[1];

        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_slice_origin_step);
    }
#pragma unroll
            for(index_t iter0 = 0; iter0 < Len0; ++iter0)
            {
#pragma unroll
                for(index_t iter1 = 0; iter1 < Len1; ++iter1)
                {
                    // do work
                    transfer_data<SrcData,
                                  1,
                                  SrcAddressSpace,
                                  DstAddressSpace,
                                  DstInMemOp,
                                  SrcScalarStrideInVector,
                                  DstScalarStrideInVector>(
                        p_src,
                        src_slice_origin_.GetOffset(),
                        coordinate_has_valid_offset_assuming_visible_index_is_valid(
                            src_desc, src_slice_origin_),
                        src_desc.GetElementSpaceSize(),
                        p_dst,
                        dst_slice_origin_.GetOffset(),
                        coordinate_has_valid_offset_assuming_visible_index_is_valid(
                            dst_desc, dst_slice_origin_),
                        dst_desc.GetElementSpaceSize());

    private:
    SrcCoord src_slice_origin_;
    DstCoord dst_slice_origin_;
};

                    // move dim1 iterator
                    if(iter1 < Len1 - 1)
                    {
                        bool forward_dim1 = (iter0 % 2 == 0);

// this version is less likely to have scratch memory issue, due to:
// 1. It does not keep reference to tensor descriptor
// 2. It does not construct new tensor coordinate for this->Run()
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename DimAccessOrder,
          index_t DstVectorDim,
          index_t DstScalarPerVector,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          InMemoryDataOperation DstInMemOp,
          index_t DstScalarStrideInVector,
          bool SrcResetCoordinateAfterRun,
          bool DstResetCoordinateAfterRun>
struct ThreadwiseDynamicTensorSliceTransfer_v1r3
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

                        if(forward_dim1)
                        {
                            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
                            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_p1);
                        }
                        else
                        {
                            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
                            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_m1);
                        }
                    }
                }

    using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));

                // move dim0 iterator
                if(iter0 < Len0 - 1)
                {
                    move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
                    move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
                }
            }
        }
        else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
        {
            // TODO use constexpr for coordinate-step to make sure compiler behave correctly
#if 0
            const auto src_step_0_0_0_p1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
            const auto src_step_0_0_0_m1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));

    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));

            const auto src_step_0_0_p1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
            const auto src_step_0_0_m1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3(const DstDesc& dst_desc,
                                                                   const Index& dst_slice_origin_idx)
        : dst_slice_origin_coord_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx))
    {
    }

            const auto src_step_0_p1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
            const auto src_step_0_m1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));

    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3()
        : ThreadwiseDynamicTensorSliceTransfer_v1r3(DstDesc{}, make_zero_multi_index<nDim>())
    {
    }

            const auto src_step_p1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
            const auto src_step_m1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_slice_origin_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

            const auto dst_step_0_0_0_p1 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
            const auto dst_step_0_0_0_m1 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));

    __device__ void Run_hack(const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
    {
        // hardcoded for 4D
        // TODO: implement N-D
        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4,
                      "wrong! hardcoded for 4D tensor");

            const auto dst_step_0_0_p1_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
            const auto dst_step_0_0_m1_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));

        constexpr auto dst_scalar_per_access = [&]() {
            Index dst_scalar_per_access;

            const auto dst_step_0_p1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
            const auto dst_step_0_m1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == DstVectorDim)
                {
                    dst_scalar_per_access(i) = DstScalarPerVector;
                }
                else
                {
                    dst_scalar_per_access(i) = 1;
                }
            });

            const auto dst_step_p1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
            const auto dst_step_m1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#else
            // hack for output tensor
            const auto src_step_0_0_0_p1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
            const auto src_step_0_0_0_m1 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));

            return dst_scalar_per_access;
        }();

            const auto src_step_0_0_p1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
            const auto src_step_0_0_m1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));

        constexpr auto dst_scalar_step_in_vector = [&]() {
            Index dst_scalar_step_in_vector;

            const auto src_step_0_p1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
            const auto src_step_0_m1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == DstVectorDim)
                {
                    dst_scalar_step_in_vector(i) = 1;
                }
                else
                {
                    dst_scalar_step_in_vector(i) = 0;
                }
            });

            const auto src_step_p1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
            const auto src_step_m1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
            return dst_scalar_step_in_vector;
        }();

            const auto dst_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, 0, 1), Sequence<0, 0, 1, 0, 0>{});
            const auto dst_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, 0, -1), Sequence<0, 0, 2, 0, 0>{});

        constexpr auto access_lengths = [&]() {
            Index access_lengths;

            const auto dst_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, 1, 0), Sequence<0, 0, 1, 0, 0>{});
            const auto dst_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, -1, 0), Sequence<0, 0, 2, 0, 0>{});

            static_for<0, nDim, 1>{}(
                [&](auto i) { access_lengths(i) = SliceLengths{}[i] / dst_scalar_per_access[i]; });

            const auto dst_step_0_p1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
            const auto dst_step_0_m1_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));

            return access_lengths;
        }();

            const auto dst_step_p1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
            const auto dst_step_m1_0_0_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));

#if 0
        const auto dst_forward_steps =
            make_tuple(make_dynamic_tensor_coordinate_step(
                           dst_desc, make_multi_index(1, 0, 0, 0) * dst_scalar_per_access),
                       make_dynamic_tensor_coordinate_step(
                           dst_desc, make_multi_index(0, 1, 0, 0) * dst_scalar_per_access),
                       make_dynamic_tensor_coordinate_step(
                           dst_desc, make_multi_index(0, 0, 1, 0) * dst_scalar_per_access),
                       make_dynamic_tensor_coordinate_step(
                           dst_desc, make_multi_index(0, 0, 0, 1) * dst_scalar_per_access),

        const auto dst_backward_steps =
            make_tuple(make_dynamic_tensor_coordinate_step(
                           dst_desc, make_multi_index(-1, 0, 0, 0) * dst_scalar_per_access),
                       make_dynamic_tensor_coordinate_step(
                           dst_desc, make_multi_index(0, -1, 0, 0) * dst_scalar_per_access),
                       make_dynamic_tensor_coordinate_step(
                           dst_desc, make_multi_index(0, 0, -1, 0) * dst_scalar_per_access),
                       make_dynamic_tensor_coordinate_step(
                           dst_desc, make_multi_index(0, 0, 0, -1) * dst_scalar_per_access));
#else
        // hack for NKHW output tensor
        const auto dst_forward_steps = make_tuple(
            make_dynamic_tensor_coordinate_step(dst_desc,
                                                make_multi_index(1, 0, 0, 0) * dst_scalar_per_access),
            make_dynamic_tensor_coordinate_step(dst_desc,
                                                make_multi_index(0, 1, 0, 0) * dst_scalar_per_access),
            make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, 1, 0) * dst_scalar_per_access, Sequence<0, 0, 1, 0, 0>{}),
            make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, 0, 1) * dst_scalar_per_access, Sequence<0, 0, 1, 0, 0>{}));

        const auto dst_backward_steps = make_tuple(
            make_dynamic_tensor_coordinate_step(dst_desc,
                                                make_multi_index(-1, 0, 0, 0) * dst_scalar_per_access),
            make_dynamic_tensor_coordinate_step(dst_desc,
                                                make_multi_index(0, -1, 0, 0) * dst_scalar_per_access),
            make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, -1, 0) * dst_scalar_per_access, Sequence<0, 0, 2, 0, 0>{}),
            make_dynamic_tensor_coordinate_step_hack(
                dst_desc, make_multi_index(0, 0, 0, -1) * dst_scalar_per_access, Sequence<0, 0, 2, 0, 0>{}));
#endif
            constexpr index_t Len0 = SliceLengths{}[0];
            constexpr index_t Len1 = SliceLengths{}[1];
            constexpr index_t Len2 = SliceLengths{}[2];
            constexpr index_t Len3 = SliceLengths{}[3];

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
#pragma unroll
            for(index_t iter0 = 0; iter0 < Len0; ++iter0)
            {
#pragma unroll
                for(index_t iter1 = 0; iter1 < Len1; ++iter1)
                {
#pragma unroll
                    for(index_t iter2 = 0; iter2 < Len2; ++iter2)
                    {
#pragma unroll
                        for(index_t iter3 = 0; iter3 < Len3; ++iter3)
                        {
                            // do work
                            transfer_data<SrcData,
                                          1,
                                          SrcAddressSpace,
                                          DstAddressSpace,
                                          DstInMemOp,
                                          SrcScalarStrideInVector,
                                          DstScalarStrideInVector>(
                                p_src,
                                src_slice_origin_.GetOffset(),
                                coordinate_has_valid_offset_assuming_visible_index_is_valid(
                                    src_desc, src_slice_origin_),
                                src_desc.GetElementSpaceSize(),
                                p_dst,
                                dst_slice_origin_.GetOffset(),
                                coordinate_has_valid_offset_assuming_visible_index_is_valid(
                                    dst_desc, dst_slice_origin_),
                                dst_desc.GetElementSpaceSize());

        index_t counter = 0;

                            // move dim1 iterator
                            if(iter3 < Len3 - 1)
                            {
                                bool forward_dim3 = (iter2 % 2 == 0);
        // loop over dim0
        static_for<0,
                   SliceLengths{}[DimAccessOrder{}[I0]],
                   dst_scalar_per_access[DimAccessOrder{}[I0]]>{}([&](auto iter0) {
            constexpr index_t i0 = iter0;

                                if(forward_dim3)
                                {
                                    move_dynamic_tensor_coordinate(
                                        src_desc, src_slice_origin_, src_step_0_0_0_p1);
                                    move_dynamic_tensor_coordinate(
                                        dst_desc, dst_slice_origin_, dst_step_0_0_0_p1);
                                }
                                else
                                {
                                    move_dynamic_tensor_coordinate(
                                        src_desc, src_slice_origin_, src_step_0_0_0_m1);
                                    move_dynamic_tensor_coordinate(
                                        dst_desc, dst_slice_origin_, dst_step_0_0_0_m1);
                                }
                            }
                        }

            constexpr bool forward_dim1 =
                (iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) % 2 == 0;

                        // move dim1 iterator
                        if(iter2 < Len2 - 1)

            // loop over dim1
            static_for<0,
                       SliceLengths{}[DimAccessOrder{}[I1]],
                       dst_scalar_per_access[DimAccessOrder{}[I1]]>{}([&](auto iter1) {
                constexpr index_t i1 =
                    forward_dim1 ? iter1
                                 : SliceLengths{}[DimAccessOrder{}[I1]] -
                                       dst_scalar_per_access[DimAccessOrder{}[I1]] - iter1;

                constexpr bool forward_dim2 =
                    ((iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) *
                         access_lengths[DimAccessOrder{}[I1]] +
                     (iter1 / dst_scalar_per_access[DimAccessOrder{}[I1]])) %
                        2 ==
                    0;

                // loop over dim2
                static_for<0,
                           SliceLengths{}[DimAccessOrder{}[I2]],
                           dst_scalar_per_access[DimAccessOrder{}[I2]]>{}([&](auto iter2) {
                    constexpr index_t i2 =
                        forward_dim2 ? iter2
                                     : SliceLengths{}[DimAccessOrder{}[I2]] -
                                           dst_scalar_per_access[DimAccessOrder{}[I2]] - iter2;

                    constexpr bool forward_dim3 =
                        (((iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) *
                              access_lengths[DimAccessOrder{}[I1]] +
                          (iter1 / dst_scalar_per_access[DimAccessOrder{}[I1]])) *
                             access_lengths[DimAccessOrder{}[I2]] +
                         (iter2 / dst_scalar_per_access[DimAccessOrder{}[I2]])) %
                            2 ==
                        0;

                    // loop over dim3
                    static_for<0,
                               SliceLengths{}[DimAccessOrder{}[I3]],
                               dst_scalar_per_access[DimAccessOrder{}[I3]]>{}([&](auto iter3) {
                        constexpr index_t i3 =
                            forward_dim3 ? iter3
                                         : SliceLengths{}[DimAccessOrder{}[I3]] -
                                               dst_scalar_per_access[DimAccessOrder{}[I3]] - iter3;

                        // do work
                        // hardcoding for buffer_store
                        // TODO refactor transfer_data() to encapsulate this
                        static_assert(SrcAddressSpace == AddressSpace::Vgpr &&
                                          DstAddressSpace == AddressSpace::Global,
                                      "wrong! hardcoded to use buffer_store");

                        using DstVectorType =
                            typename vector_type<DstData, DstScalarPerVector>::MemoryType;

                        vector_type<DstData, DstScalarPerVector> dst_vector;
                        // this is hardcoded for src that has compile-time tensor descriptor
                        static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
                            // hack: assume src_slice_origin_idx is 0
                            constexpr index_t src_offset = SrcDesc::CalculateOffset(
                                container_reorder_given_old2new(make_multi_index(i0, i1, i2, i3),
                                                                DimAccessOrder{}) +
                                i * dst_scalar_step_in_vector);

                            dst_vector(i) = p_src[Number<src_offset>{}];
                        });

                        amd_buffer_store_v2<DstData, DstScalarPerVector>(
                            dst_vector.Vector(),
                            p_dst,
                            dst_slice_origin_coord_.GetOffset(),
                            coordinate_has_valid_offset_assuming_visible_index_is_valid(
                                dst_desc, dst_slice_origin_coord_),
                            dst_desc.GetElementSpaceSize());

                        // move along dim3
                        if constexpr(iter3 < SliceLengths{}[DimAccessOrder{}[I3]] -
                                                 dst_scalar_per_access[DimAccessOrder{}[I3]])
                        {
                            bool forward_dim2 = (iter1 % 2 == 0);

                            if(do_carry) // removed branch condition from v1r2
                            if constexpr(forward_dim3)
                            {
                                move_dynamic_tensor_coordinate(
                                    src_desc, src_slice_origin_, src_step_0_0_p1_0);
                                move_dynamic_tensor_coordinate(
                                    dst_desc, dst_slice_origin_, dst_step_0_0_p1_0);
                                move_dynamic_tensor_coordinate(
                                    dst_desc,
                                    dst_slice_origin_coord_,
                                    dst_forward_steps[DimAccessOrder{}[I3]]);
                            }
                            else
                            {
                                move_dynamic_tensor_coordinate(
                                    src_desc, src_slice_origin_, src_step_0_0_m1_0);
                                move_dynamic_tensor_coordinate(
                                    dst_desc, dst_slice_origin_, dst_step_0_0_m1_0);
                                move_dynamic_tensor_coordinate(
                                    dst_desc,
                                    dst_slice_origin_coord_,
                                    dst_backward_steps[DimAccessOrder{}[I3]]);
                            }
                        }
                    }
                }
            });
                            // move dim1 iterator
                            if(iter1 < Len1 - 1)

                    // move along dim2
                    if constexpr(iter2 < SliceLengths{}[DimAccessOrder{}[I2]] -
                                             dst_scalar_per_access[DimAccessOrder{}[I2]])
                            {
                                bool forward_dim1 = (iter0 % 2 == 0);

                                if(forward_dim1)
                        if constexpr(forward_dim2)
                        {
                            move_dynamic_tensor_coordinate(
                                src_desc, src_slice_origin_, src_step_0_p1_0_0);
                            move_dynamic_tensor_coordinate(
                                dst_desc, dst_slice_origin_, dst_step_0_p1_0_0);
                            move_dynamic_tensor_coordinate(dst_desc,
                                                           dst_slice_origin_coord_,
                                                           dst_forward_steps[DimAccessOrder{}[I2]]);
                        }
                        else
                        {
                            move_dynamic_tensor_coordinate(
                                src_desc, src_slice_origin_, src_step_0_m1_0_0);
                            move_dynamic_tensor_coordinate(
                                dst_desc, dst_slice_origin_, dst_step_0_m1_0_0);
                            move_dynamic_tensor_coordinate(dst_desc,
                                                           dst_slice_origin_coord_,
                                                           dst_backward_steps[DimAccessOrder{}[I2]]);
                        }
                    }
                }
            });
                        // move dim0 iterator
                        if(iter0 < Len0 - 1)

                // move along dim1
                if constexpr(iter1 < SliceLengths{}[DimAccessOrder{}[I1]] -
                                         dst_scalar_per_access[DimAccessOrder{}[I1]])
                        {
                            // move forward in dim0
                            move_dynamic_tensor_coordinate(
                                src_desc, src_slice_origin_, src_step_p1_0_0_0);
                            move_dynamic_tensor_coordinate(
                                dst_desc, dst_slice_origin_, dst_step_p1_0_0_0);

                    if constexpr(forward_dim1)
                    {
                        move_dynamic_tensor_coordinate(dst_desc,
                                                       dst_slice_origin_coord_,
                                                       dst_forward_steps[DimAccessOrder{}[I1]]);
                    }
                    else
                    {
                        move_dynamic_tensor_coordinate(dst_desc,
                                                       dst_slice_origin_coord_,
                                                       dst_backward_steps[DimAccessOrder{}[I1]]);
                    }
                }
            }
        }

        // move src and dst coordinate back to their origins
        if constexpr(SrcResetCoordinateAfterRun)
        {
            const auto src_back_step =
                make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
            });

            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
        }

            // move along dim0
            if constexpr(iter0 < SliceLengths{}[DimAccessOrder{}[I0]] -
                                     dst_scalar_per_access[DimAccessOrder{}[I0]])
            {
                move_dynamic_tensor_coordinate(dst_desc,
                                               dst_slice_origin_coord_,
                                               dst_forward_steps[DimAccessOrder{}[I0]]);
            }
        });

        // move dst coordinate back to slice origin (or not)
        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_back_step =
                make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
            const auto dst_back_step =
                make_dynamic_tensor_coordinate_step(dst_desc, GetDstCoordinateBackStep());

            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, dst_back_step);
        }
    }
    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)

    __device__ static constexpr auto GetDstCoordinateBackStep()
    {
        // is it OK to construct a new step every time?
        const auto src_slice_origin_step =
            make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);

        constexpr auto dst_scalar_per_access = [&]() {
            Index dst_scalar_per_access;

        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_slice_origin_step);

            static_for<0, nDim, 1>{}([&](auto i) {
                if constexpr(i == DstVectorDim)
                {
                    dst_scalar_per_access(i) = DstScalarPerVector;
                }
                else
                {
                    dst_scalar_per_access(i) = 1;
                }
            });

            return dst_scalar_per_access;
        }();

        MultiIndex<nDim> dst_back_step;

        // TODO: this is wrong, need to consider DimAccessOrder
        dst_back_step(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];

        static_for<1, nDim, 1>{}([&](auto i) {
            constexpr auto i_m1 = i - Number<1>{};

            // TODO: this is wrong
            dst_back_step(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
                                   ? 0
                                   : (dst_scalar_per_access[i] - SliceLengths{}[i]);
        });

        return dst_back_step;
    }

    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by RunWrite(), then need to adjust the step here
        const auto adjusted_step_idx = DstResetCoordinateAfterRun
                                           ? dst_slice_origin_step_idx
                                           : dst_slice_origin_step_idx + GetDstCoordinateBackStep();

        // is it OK to construct a new step every time?
        const auto dst_slice_origin_step =
            make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_slice_origin_step);
        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, adjusted_step);
    }

    private:
    SrcCoord src_slice_origin_;
    DstCoord dst_slice_origin_;
    DstCoord dst_slice_origin_coord_;
};

// this version does following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
...
@@ -746,7 +856,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        static_for<0, nDim, 1>{}([&](auto i) {
            if constexpr(i == SrcVectorDim)
            {
                src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
                src_scalar_per_access(i) = SrcScalarPerVector;
            }
            else
            {
...
@@ -774,26 +884,17 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
            return src_scalar_step_in_vector;
        }();

        constexpr auto access_lengths = [&]() {
            Index access_lengths;

            static_for<0, nDim, 1>{}(
                [&](auto i) { access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i]; });

            return access_lengths;
        }();

        // TODO use constexpr for coordinate-step to make sure compiler behave correctly
        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(
        const auto src_step_0_p = make_dynamic_tensor_coordinate_step(
            src_desc, make_multi_index(0, 1) * src_scalar_per_access);
        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(
        const auto src_step_0_m = make_dynamic_tensor_coordinate_step(
            src_desc, make_multi_index(0, -1) * src_scalar_per_access);
        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(
        const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
            src_desc, make_multi_index(1, 0) * src_scalar_per_access);
        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(
        const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
            src_desc, make_multi_index(-1, 0) * src_scalar_per_access);

        constexpr auto I0 = Number<0>{};
...
@@ -815,18 +916,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                static_assert(SrcAddressSpace == AddressSpace::Global,
                              "wrong! hardcoded to use buffer_load, src must be global mem");

#if 0 // only works for SrcScalarPerVector == 1
                auto src_data = amd_buffer_load<SrcData, 1>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#else
                vector_type<SrcData, SrcScalarPerVector> src_vector;

                using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
...
@@ -845,26 +934,25 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                    buffer_(Number<buffer_offset>{}) = src_vector[i];
                });
#endif

                // move dim1 iterator
                if constexpr(iter1.value < access_lengths[I1] - 1)
                if constexpr(iter1.value < SliceLengths{}[I1] - src_scalar_per_access[I1])
                {
                    if constexpr(forward_dim1)
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p);
                    }
                    else
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m);
                    }
                }
            });

            // move dim0 iterator
            if constexpr(iter0.value < access_lengths[I0] - 1)
            if constexpr(iter0.value < SliceLengths{}[I0] - src_scalar_per_access[I0])
            {
                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p_0);
            }
        });
...
@@ -888,14 +976,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
        {
            // TODO use constexpr for coordinate-step to make sure compiler behave correctly
            const auto dst_step_0_p1 =
            const auto dst_step_0_p =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
            const auto dst_step_0_m1 =
            const auto dst_step_0_m =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
            const auto dst_step_p1_0 =
            const auto dst_step_p_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
            const auto dst_step_m1_0 =
            const auto dst_step_m_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));

            constexpr index_t Len0 = SliceLengths{}[0];
...
@@ -927,12 +1015,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                    if constexpr(forward_dim1)
                    {
                        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_p1);
                        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_p);
                    }
                    else
                    {
                        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_m1);
                        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_0_m);
                    }
                }
            });
...
@@ -940,7 +1028,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
            // move dim0 iterator
            if constexpr(iter0.value < Len0 - 1)
            {
                move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
                move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p_0);
            }
        });
    }
...
@@ -968,7 +1056,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        static_for<0, nDim, 1>{}([&](auto i) {
            if constexpr(i == SrcVectorDim)
            {
                src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
                src_scalar_per_access(i) = SrcScalarPerVector;
            }
            else
            {
...
@@ -1007,54 +1095,54 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
#if 0 // hack
        // TODO use constexpr for coordinate-step to make sure compiler behave correctly
        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(
        const auto src_step_0_p = make_dynamic_tensor_coordinate_step(
            src_desc, make_multi_index(0, 1) * src_scalar_per_access);
        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(
        const auto src_step_0_m = make_dynamic_tensor_coordinate_step(
            src_desc, make_multi_index(0, -1) * src_scalar_per_access);
        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(
        const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
            src_desc, make_multi_index(1, 0) * src_scalar_per_access);
        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(
        const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
            src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
#elif 0
#elif 1
        // for padded input tensor
        const auto src_step_0_p1 =
        const auto src_step_0_p = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, 1) * src_scalar_per_access,
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
        const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, -1) * src_scalar_per_access,
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
        const auto src_step_p1_0 =
        const auto src_step_p_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(1, 0) * src_scalar_per_access,
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
        const auto src_step_m_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(-1, 0) * src_scalar_per_access,
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 1
        // for non-padded input tensor
        const auto src_step_0_p1 =
        const auto src_step_0_p = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, 1) * src_scalar_per_access, Sequence<0, 0, 0, 0, 0, 0, 1>{});
        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
        const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, -1) * src_scalar_per_access, Sequence<0, 0, 0, 0, 0, 0, 2>{});
        const auto src_step_p1_0 =
        const auto src_step_p_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(1, 0) * src_scalar_per_access, Sequence<0, 0, 0, 0, 0, 1, 0>{});
        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
        const auto src_step_m_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(-1, 0) * src_scalar_per_access, Sequence<0, 0, 0, 0, 0, 2, 0>{});
...
@@ -1079,28 +1167,17 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                static_assert(SrcAddressSpace == AddressSpace::Global,
                              "wrong! hardcoded to use buffer_load, src must be global mem");

#if 0 // only works for SrcScalarPerVector == 1
                auto src_data = amd_buffer_load<SrcData, 1>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

                constexpr index_t buffer_offset =
                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

                using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#else
                vector_type<SrcData, SrcScalarPerVector> src_vector;

                using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

                src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                    src_desc, src_slice_origin_);

#if 1
                src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};

                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
...
@@ -1109,6 +1186,29 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                    buffer_(Number<buffer_offset>{}) = src_vector[i];
                });
#elif 0
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
                        make_multi_index(i0, i1) + i * src_scalar_step_in_vector);

                    buffer_(Number<buffer_offset>{}) = is_valid ? src_vector[i] : SrcData{0};
                });
#elif 0
                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
                        make_multi_index(i0, i1) + i * src_scalar_step_in_vector);

                    int32x2_t is_valid_i32 = is_valid;

                    asm volatile("\n \
                        v_cmp_gt_u32_e64 is_valid_flag, is_valid_i32, 0 \n \
                        v_cndmask_b32_e64 src_data, 0, src_data, is_valid_flag \n \
                        "
                                 : "=s"(is_valid_flag), "=v"(src_data),
                                 : "v"(is_valid_i32), "2"(is_valid_flag), "3"(src_data));

                    buffer_(Number<buffer_offset>{}) = src_data;
                });
#endif

                // move dim1 iterator
...
@@ -1116,11 +1216,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                {
                    if constexpr(forward_dim1)
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p);
                    }
                    else
                    {
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m);
                    }
                }
            });
...
@@ -1128,7 +1228,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
            // move dim0 iterator
            if constexpr(iter0.value < access_lengths[I0] - 1)
            {
                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p_0);
            }
        });
...
@@ -1150,7 +1250,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        static_for<0, nDim, 1>{}([&](auto i) {
            if constexpr(i == SrcVectorDim)
            {
                src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
                src_scalar_per_access(i) = SrcScalarPerVector;
            }
            else
            {
...
@@ -1167,7 +1267,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        static_for<1, nDim, 1>{}([&](auto i) {
            constexpr auto i_m1 = i - Number<1>{};

            src_back_step(i) = (SliceLengths{}[i_m1] % (2 * src_scalar_per_access[i_m1]) == 0)
            src_back_step(i) =
                (SliceLengths{}[i_m1] % (2 * src_scalar_per_access[i_m1]) == 0)
                    ? 0
                    : (src_scalar_per_access[i] - SliceLengths{}[i]);
        });
...
@@ -1183,7 +1284,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        static_for<0, nDim, 1>{}([&](auto i) {
            if constexpr(i == DstVectorDim)
            {
                dst_scalar_per_access(i) = DstScalarPerVector * DstScalarStrideInVector;
                dst_scalar_per_access(i) = DstScalarPerVector;
            }
            else
            {
...
@@ -1200,13 +1301,15 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        static_for<1, nDim, 1>{}([&](auto i) {
            constexpr auto i_m1 = i - Number<1>{};

            dst_back_step(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
            dst_back_step(i) =
                (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
                    ? 0
                    : (dst_scalar_per_access[i] - SliceLengths{}[i]);
        });

        return dst_back_step;
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
...
@@ -1250,7 +1353,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
#if 0 // hack
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(
            src_desc, adjusted_step_idx);
#elif 0
#elif 1
        // for padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
...
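Both the v1r3 store loop and the v3 load loop above walk the slice in a zig-zag ("snake") order: an inner dimension's direction flips with the parity of the accumulated outer iteration count, so consecutive accesses only ever move the coordinate by one step. A standalone 2-D sketch of that ordering (the sizes are illustrative, not the kernel's):

#include <cstdio>

int main()
{
    constexpr int len0 = 3; // outer dimension
    constexpr int len1 = 4; // inner dimension, direction flips with the parity of iter0

    for(int iter0 = 0; iter0 < len0; ++iter0)
    {
        const bool forward_dim1 = (iter0 % 2 == 0);

        for(int iter1 = 0; iter1 < len1; ++iter1)
        {
            // Visit forward on even outer iterations, backward on odd ones, so that
            // moving to the next element never requires resetting the inner coordinate.
            const int i1 = forward_dim1 ? iter1 : (len1 - 1 - iter1);
            std::printf("(%d, %d)\n", iter0, i1);
        }
    }
    return 0;
}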
composable_kernel/include/utility/amd_buffer_addressing.hpp
...
@@ -97,8 +97,8 @@ __llvm_amdgcn_buffer_atomic_add_f32(float vdata,
#endif

// buffer_load requires:
//   1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
//   2) p_src_thread to be a wavewise pointer.
//   1) p_src_wave must be in global memory space
//   2) p_src_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
...
...
@@ -118,6 +118,18 @@ __device__ void amd_buffer_store(const T* p_src_thread,
bool
dst_thread_data_valid
,
index_t
dst_data_range
);
// buffer_store requires:
// 1) p_dst_wave must be global memory
// 2) p_dst_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template
<
typename
T
,
index_t
VectorSize
>
__device__
void
amd_buffer_store_v2
(
const
typename
vector_type
<
T
,
VectorSize
>::
MemoryType
src_thread_data
,
T
*
p_dst_wave
,
const
index_t
dst_thread_data_offset
,
const
bool
dst_thread_data_valid
,
const
index_t
dst_data_range
);
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread to be a wavewise pointer.
...
...
@@ -926,6 +938,126 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
#endif
}

template <>
__device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
                                              float* p_dst_wave,
                                              const index_t dst_thread_data_offset,
                                              const bool dst_thread_data_valid,
                                              const index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_buffer_resource;

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(src_thread_data,
                                   dst_wave_buffer_resource.data,
                                   0,
                                   dst_addr_shift + dst_thread_addr_offset,
                                   false,
                                   false);
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32(src_thread_data,
                                       dst_wave_buffer_resource.data,
                                       0,
                                       dst_thread_addr_offset,
                                       false,
                                       false);
    }
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
                                              float* p_dst_wave,
                                              const index_t dst_thread_data_offset,
                                              const bool dst_thread_data_valid,
                                              const index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_buffer_resource;

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(src_thread_data,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x2(src_thread_data,
                                         dst_wave_buffer_resource.data,
                                         0,
                                         dst_thread_addr_offset,
                                         false,
                                         false);
    }
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
                                              float* p_dst_wave,
                                              const index_t dst_thread_data_offset,
                                              const bool dst_thread_data_valid,
                                              const index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_buffer_resource;

    // wavewise base address (64 bit)
    dst_wave_buffer_resource.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_buffer_resource.config[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(src_thread_data,
                                     dst_wave_buffer_resource.data,
                                     0,
                                     dst_addr_shift + dst_thread_addr_offset,
                                     false,
                                     false);
#else
    if(dst_thread_data_valid)
    {
        __llvm_amdgcn_buffer_store_f32x4(src_thread_data,
                                         dst_wave_buffer_resource.data,
                                         0,
                                         dst_thread_addr_offset,
                                         false,
                                         false);
    }
#endif
}

#if CK_USE_AMD_BUFFER_ATOMIC_FADD
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
...
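The CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK path above avoids a branch by adding a huge offset to invalid stores so the hardware's buffer range check discards them. The branch-free selection itself can be shown with plain host code; the 0x7fffffff shift constant comes from the diff, while the range and offset values below are illustrative stand-ins for the buffer descriptor fields.

#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t range_bytes = 1024 * sizeof(float); // wavewise range programmed into the descriptor
    const uint32_t offset      = 16 * sizeof(float);   // this thread's byte offset

    for(const bool valid : {true, false})
    {
        // Same idea as dst_addr_shift in amd_buffer_store_v2: invalid lanes get an offset
        // far beyond the descriptor range, so the store is dropped by the range check.
        const uint32_t addr_shift = valid ? 0u : 0x7fffffffu;
        const uint32_t effective  = addr_shift + offset;

        std::printf("valid=%d effective_offset=0x%08x in_range=%d\n",
                    valid, effective, effective < range_bytes);
    }
    return 0;
}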
composable_kernel/include/utility/config.amd.hpp.in
...
@@ -91,7 +91,7 @@
#endif

// workaround: put all workaround here
// workaround for unnecessary VGPA <--> AGRP data movement when using mfma LLVM intrinsic
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif
...
@@ -123,6 +123,8 @@ using index_t = uint32_t;
using index_t = int32_t;
#endif

typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));

// int32x4_t use by buffer_load and buffer_store llvm intrinsic
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
...
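ext_vector_type is a Clang extension, so the new int32x4_t typedef compiles with hipcc/clang rather than with a strictly standard compiler. A minimal standalone example of the same attribute is below (build with clang; the values and the element-wise arithmetic are just for illustration).

#include <stdint.h>
#include <cstdio>

// Same pattern as the new typedef in config.amd.hpp.in: a 4-lane vector of int32_t,
// which matches the shape of the resource descriptor used by buffer_load/buffer_store.
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));

int main()
{
    int32x4_t v = {1, 2, 3, 4};
    v = v + 10; // element-wise arithmetic is supported on ext_vector_type values
    std::printf("%d %d %d %d\n", v[0], v[1], v[2], v[3]);
    return 0;
}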
composable_kernel/include/utility/functional2.hpp
...
@@ -32,7 +32,8 @@ struct static_for
        static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0,
                      "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
        static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd),
                      "wrongs! should have NBegin <= NEnd");
                      "wrongs! should (Increment > 0 && NBegin <= NEnd) || (Increment < 0 && "
                      "NBegin >= NEnd)");
    }

    template <class F>
...
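The updated static_assert message spells out the full contract: a non-zero Increment whose sign matches the direction from NBegin to NEnd, with the distance divisible by Increment. CK's own static_for is not reproduced here; the standalone sketch below enforces the same preconditions with C++17 "if constexpr" recursion (the function name is illustrative, not the library's).

#include <cstdio>
#include <type_traits>

// Minimal compile-time counted loop with the same preconditions as ck::static_for.
template <int NBegin, int NEnd, int Increment, typename F>
void static_for_sketch(F f)
{
    static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0,
                  "should satisfy (NEnd - NBegin) % Increment == 0");
    static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd),
                  "should satisfy (Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd)");

    if constexpr(NBegin != NEnd)
    {
        f(std::integral_constant<int, NBegin>{});
        static_for_sketch<NBegin + Increment, NEnd, Increment>(f);
    }
}

int main()
{
    // Forward and reverse iteration, mirroring static_for<NDimLow - 1, 0, -1>{} in the diff.
    static_for_sketch<0, 4, 1>([](auto i) { std::printf("fwd %d\n", i.value); });
    static_for_sketch<3, 0, -1>([](auto i) { std::printf("rev %d\n", i.value); });
    return 0;
}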
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...
@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN   = 1;
    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
#elif 1
    // cdata = 64, BlockSize = 256, 128x128x8
    // b thread copy 2x2
    constexpr index_t BlockSize = 256;
...
@@ -166,7 +166,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<4, 1>;
    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;

    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 2;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>;
...
@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto conv_driver =
#if 0
#if 1
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
driver/src/conv_driver.cpp
...
@@ -22,7 +22,22 @@ int main(int argc, char* argv[])
{
    using namespace ck;

#if 1
#if 0
    // 3x3, 36x36, stride 2
    constexpr index_t N  = 128;
    constexpr index_t C  = 192;
    constexpr index_t HI = 37;
    constexpr index_t WI = 37;
    constexpr index_t K  = 384;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
    // 3x3, 35x35, stride 2
    constexpr index_t N = 128;
    constexpr index_t C = 192;
...
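As a quick check on the newly added problem size (N=128, C=192, 37x37 input, K=384, 3x3 filter, stride 2, dilation 1, no padding), the standard convolution output-size formula gives an 18x18 output. The short standalone computation below copies those values from the config above; the formula itself is the usual one and is not taken from the driver.

#include <cstdio>

int main()
{
    const int HI = 37, WI = 37, Y = 3, X = 3;
    const int stride = 2, dilation = 1, left_pad = 0, right_pad = 0;

    // HO = (HI + pads - dilation*(Y - 1) - 1) / stride + 1, and likewise for WO.
    const int HO = (HI + left_pad + right_pad - dilation * (Y - 1) - 1) / stride + 1;
    const int WO = (WI + left_pad + right_pad - dilation * (X - 1) - 1) / stride + 1;

    std::printf("output %dx%d\n", HO, WO); // prints "output 18x18"
    return 0;
}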