Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
681ede91
Commit
681ede91
authored
Dec 19, 2021
by
Chao Liu
Browse files
clean up
parent
edb1d2c3
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
73 additions
and
94 deletions
+73
-94
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r1.hpp
...el/include/tensor_operation/gridwise_gemm_xdlops_v3r1.hpp
+14
-57
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r3.hpp
...el/include/tensor_operation/gridwise_gemm_xdlops_v3r3.hpp
+11
-7
device_operation/include/device_conv2d_fwd_xdl_output_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
...xdl_output_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+19
-10
device_operation/include/device_conv2d_fwd_xdl_output_shuffle_nhwc_kyxc_nhwk.hpp
...e/device_conv2d_fwd_xdl_output_shuffle_nhwc_kyxc_nhwk.hpp
+19
-10
example/4_conv2d_fwd_xdl_output_shuffle/conv2d_fwd_xdl_output_shuffle.cpp
..._fwd_xdl_output_shuffle/conv2d_fwd_xdl_output_shuffle.cpp
+5
-5
example/6_conv2d_fwd_xdl_output_shuffle_bias_relu_add/conv2d_fwd_xdl_output_shuffle_bias_relu_add.cpp
..._relu_add/conv2d_fwd_xdl_output_shuffle_bias_relu_add.cpp
+5
-5
No files found.
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r1.hpp
View file @
681ede91
...
...
@@ -83,6 +83,7 @@ template <index_t BlockSize,
index_t
ABlockTransferSrcScalarPerVector
,
index_t
ABlockTransferDstScalarPerVector_K1
,
bool
AThreadTransferSrcResetCoordinateAfterRun
,
bool
ABlockLdsExtraM
,
typename
BBlockTransferThreadSliceLengths_K0_N_K1
,
typename
BBlockTransferThreadClusterLengths_K0_N_K1
,
typename
BBlockTransferThreadClusterArrangeOrder
,
...
...
@@ -91,11 +92,14 @@ template <index_t BlockSize,
index_t
BBlockTransferSrcScalarPerVector
,
index_t
BBlockTransferDstScalarPerVector_K1
,
bool
BThreadTransferSrcResetCoordinateAfterRun
,
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
bool
ABlockLdsExtraM
,
bool
BBlockLdsExtraN
>
bool
BBlockLdsExtraN
,
index_t
MRepeatPerShuffle_CCopy
,
index_t
NRepeatPerShuffle_CCopy
,
index_t
MRepeatThread_CCopy
,
index_t
MThread_CCopy
,
index_t
NRepeatThread_CCopy
,
index_t
NThread_CCopy
,
index_t
NScalarPerVector_CCopy
>
struct
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -498,7 +502,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
// shuffle and write out
{
#if
1
#if
0
// TODO: make it tunable
constexpr index_t MRepeatPerShuffle_CCopy = 1;
constexpr index_t NRepeatPerShuffle_CCopy = 1;
...
...
@@ -511,7 +515,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
// vector length for blockwise copy from LDS to global
constexpr index_t NScalarPerVector_CCopy = 8;
#el
se
#el
if
0
// TODO: make it tunable
constexpr
index_t
MRepeatPerShuffle_CCopy
=
1
;
constexpr
index_t
NRepeatPerShuffle_CCopy
=
2
;
...
...
@@ -654,52 +658,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
n_thread_data_on_block_idx
[
I2
]),
ck
::
tensor_operation
::
element_wise
::
PassThrough
{}};
#if 0
auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v4r1<
BlockSize, // index_t BlockSize,
ck::tensor_operation::element_wise::PassThrough, // SrcElementwiseOperation,
CElementwiseOperation, // DstElementwiseOperation,
CGlobalMemoryDataOperation, // DstInMemOp,
Sequence<1,
MRepeatPerShuffle_CCopy,
MPerBlock_CCopy,
1,
NRepeatPerShuffle_CCopy,
NPerBlock_CCopy>, // BlockSliceLengths,
Sequence<1,
MRepeatPerShuffle_CCopy,
MPerThread_CCopy,
1,
NRepeatPerShuffle_CCopy,
NPerThread_CCopy>, // ThreadSliceLengths,
Sequence<1,
MRepeatPerThread_CCopy,
MThread_CCopy,
1,
NRepeatPerThread_CCopy,
NThread_CCopy>, // ThreadClusterLengths,
Sequence<0, 1, 2, 3, 4, 5>, // typename ThreadClusterArrangeOrder,
FloatC, // typename SrcData,
FloatC, // typename DstData,
decltype(c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl),
decltype(c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl),
Sequence<0, 1, 2, 3, 4, 5>, // typename SrcDimAccessOrder,
Sequence<0, 1, 2, 3, 4, 5>, // typename DstDimAccessOrder,
5, // index_t SrcVectorDim,
5, // index_t DstVectorDim,
NScalarPerVector_CCopy, // index_t SrcScalarPerVector,
NScalarPerVector_CCopy, // index_t DstScalarPerVector,
1, // index_t SrcScalarStrideInVector,
1, // index_t DstScalarStrideInVector,
true, // bool ThreadTransferSrcResetCoordinateAfterRun,
false> // bool ThreadTransferDstResetCoordinateAfterRun>
{c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
make_multi_index(0, 0, 0, 0, 0, 0),
ck::tensor_operation::element_wise::PassThrough{},
c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0),
c_element_op};
#else
auto
c_block_copy_lds_to_global
=
BlockwiseTensorSliceTransfer_v6r1
<
BlockSize
,
// index_t BlockSize,
CElementwiseOperation
,
// ElementwiseOperation,
...
...
@@ -737,7 +695,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl
,
make_multi_index
(
block_work_idx
[
I0
],
0
,
0
,
block_work_idx
[
I1
],
0
,
0
),
c_element_op
};
#endif
constexpr
auto
mrepeat_forward_step
=
make_multi_index
(
0
,
MRepeatPerShuffle_CCopy
,
0
,
0
,
0
,
0
);
...
...
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r3.hpp
View file @
681ede91
...
...
@@ -97,6 +97,7 @@ template <index_t BlockSize,
index_t
ABlockTransferSrcScalarPerVector
,
index_t
ABlockTransferDstScalarPerVector_K1
,
bool
AThreadTransferSrcResetCoordinateAfterRun
,
bool
ABlockLdsExtraM
,
typename
BBlockTransferThreadSliceLengths_K0_N_K1
,
typename
BBlockTransferThreadClusterLengths_K0_N_K1
,
typename
BBlockTransferThreadClusterArrangeOrder
,
...
...
@@ -105,11 +106,14 @@ template <index_t BlockSize,
index_t
BBlockTransferSrcScalarPerVector
,
index_t
BBlockTransferDstScalarPerVector_K1
,
bool
BThreadTransferSrcResetCoordinateAfterRun
,
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
bool
ABlockLdsExtraM
,
bool
BBlockLdsExtraN
>
bool
BBlockLdsExtraN
,
index_t
MRepeatPerShuffle_CCopy
,
index_t
NRepeatPerShuffle_CCopy
,
index_t
MRepeatThread_CCopy
,
index_t
MThread_CCopy
,
index_t
NRepeatThread_CCopy
,
index_t
NThread_CCopy
,
index_t
NScalarPerVector_CCopy
>
struct
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -537,7 +541,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
// shuffle and write out
{
#if
1
#if
0
// TODO: make it tunable
constexpr index_t MRepeatPerShuffle_CCopy = 1;
constexpr index_t NRepeatPerShuffle_CCopy = 1;
...
...
@@ -550,7 +554,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
// vector length for blockwise copy from LDS to global
constexpr index_t NScalarPerVector_CCopy = 8;
#el
se
#el
if
0
// TODO: make it tunable
constexpr
index_t
MRepeatPerShuffle_CCopy
=
1
;
constexpr
index_t
NRepeatPerShuffle_CCopy
=
2
;
...
...
device_operation/include/device_conv2d_fwd_xdl_output_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
View file @
681ede91
...
...
@@ -41,6 +41,7 @@ template <typename InDataType,
ck
::
index_t
ABlockTransferSrcVectorDim
,
ck
::
index_t
ABlockTransferSrcScalarPerVector
,
ck
::
index_t
ABlockTransferDstScalarPerVector_K1
,
bool
ABlockLdsAddExtraM
,
typename
BBlockTransferThreadSliceLengths_K0_N_K1
,
typename
BBlockTransferThreadClusterLengths_K0_N_K1
,
typename
BBlockTransferThreadClusterArrangeOrder
,
...
...
@@ -48,10 +49,14 @@ template <typename InDataType,
ck
::
index_t
BBlockTransferSrcVectorDim
,
ck
::
index_t
BBlockTransferSrcScalarPerVector
,
ck
::
index_t
BBlockTransferDstScalarPerVector_K1
,
ck
::
index_t
CThreadTransferSrcDstVectorDim
,
ck
::
index_t
CThreadTransferDstScalarPerVector
,
bool
ABlockLdsAddExtraM
,
bool
BBlockLdsAddExtraN
>
bool
BBlockLdsAddExtraN
,
index_t
MRepeatPerShuffle_CCopy
,
index_t
NRepeatPerShuffle_CCopy
,
index_t
MRepeatThread_CCopy
,
index_t
MThread_CCopy
,
index_t
NRepeatThread_CCopy
,
index_t
NThread_CCopy
,
index_t
NScalarPerVector_CCopy
>
struct
DeviceConv2dFwdXdl_Output_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
:
public
DeviceConvFwdBiasActivationAdd
<
InElementwiseOperation
,
...
...
@@ -260,6 +265,7 @@ struct
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K1
,
false
,
// AThreadTransferSrcResetCoordinateAfterRun,
ABlockLdsAddExtraM
,
BBlockTransferThreadSliceLengths_K0_N_K1
,
BBlockTransferThreadClusterLengths_K0_N_K1
,
Sequence
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder,
...
...
@@ -268,11 +274,14 @@ struct
BBlockTransferSrcScalarPerVector
,
BBlockTransferDstScalarPerVector_K1
,
false
,
// BThreadTransferSrcResetCoordinateAfterRun,
Sequence
<
2
,
3
,
0
,
1
,
7
,
5
,
4
,
6
>
,
// CThreadTransferSrcDstAccessOrder,
7
,
// CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector
,
ABlockLdsAddExtraM
,
BBlockLdsAddExtraN
>
;
BBlockLdsAddExtraN
,
MRepeatPerShuffle_CCopy
,
NRepeatPerShuffle_CCopy
,
MRepeatThread_CCopy
,
MThread_CCopy
,
NRepeatThread_CCopy
,
NThread_CCopy
,
NScalarPerVector_CCopy
>
;
// Argument
struct
Argument
:
public
BaseArgument
...
...
device_operation/include/device_conv2d_fwd_xdl_output_shuffle_nhwc_kyxc_nhwk.hpp
View file @
681ede91
...
...
@@ -40,6 +40,7 @@ template <typename InDataType,
ck
::
index_t
ABlockTransferSrcVectorDim
,
ck
::
index_t
ABlockTransferSrcScalarPerVector
,
ck
::
index_t
ABlockTransferDstScalarPerVector_K1
,
bool
ABlockLdsAddExtraM
,
typename
BBlockTransferThreadSliceLengths_K0_N_K1
,
typename
BBlockTransferThreadClusterLengths_K0_N_K1
,
typename
BBlockTransferThreadClusterArrangeOrder
,
...
...
@@ -47,10 +48,14 @@ template <typename InDataType,
ck
::
index_t
BBlockTransferSrcVectorDim
,
ck
::
index_t
BBlockTransferSrcScalarPerVector
,
ck
::
index_t
BBlockTransferDstScalarPerVector_K1
,
ck
::
index_t
CThreadTransferSrcDstVectorDim
,
ck
::
index_t
CThreadTransferDstScalarPerVector
,
bool
ABlockLdsAddExtraM
,
bool
BBlockLdsAddExtraN
>
bool
BBlockLdsAddExtraN
,
index_t
MRepeatPerShuffle_CCopy
,
index_t
NRepeatPerShuffle_CCopy
,
index_t
MRepeatThread_CCopy
,
index_t
MThread_CCopy
,
index_t
NRepeatThread_CCopy
,
index_t
NThread_CCopy
,
index_t
NScalarPerVector_CCopy
>
struct
DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
:
public
DeviceConvFwd
<
InElementwiseOperation
,
WeiElementwiseOperation
,
OutElementwiseOperation
>
{
...
...
@@ -241,6 +246,7 @@ struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K1
,
false
,
// AThreadTransferSrcResetCoordinateAfterRun,
ABlockLdsAddExtraM
,
BBlockTransferThreadSliceLengths_K0_N_K1
,
BBlockTransferThreadClusterLengths_K0_N_K1
,
Sequence
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder,
...
...
@@ -249,11 +255,14 @@ struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N
BBlockTransferSrcScalarPerVector
,
BBlockTransferDstScalarPerVector_K1
,
false
,
// BThreadTransferSrcResetCoordinateAfterRun,
Sequence
<
2
,
3
,
0
,
1
,
7
,
5
,
4
,
6
>
,
// CThreadTransferSrcDstAccessOrder,
7
,
// CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector
,
ABlockLdsAddExtraM
,
BBlockLdsAddExtraN
>
;
BBlockLdsAddExtraN
,
MRepeatPerShuffle_CCopy
,
NRepeatPerShuffle_CCopy
,
MRepeatThread_CCopy
,
MThread_CCopy
,
NRepeatThread_CCopy
,
NThread_CCopy
,
NScalarPerVector_CCopy
>
;
// Argument
struct
Argument
:
public
BaseArgument
...
...
example/4_conv2d_fwd_xdl_output_shuffle/conv2d_fwd_xdl_output_shuffle.cpp
View file @
681ede91
...
...
@@ -33,11 +33,11 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough_v2;
using
DeviceConvFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// clang-format off
//
##
| InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer|
CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds
|
//
##
| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar|
SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN
|
//
##
| | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1|
| PerVector
| | |
//
##
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
|
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
256
,
128
,
256
,
4
,
8
,
32
,
32
,
2
,
4
,
S
<
1
,
2
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
S
<
1
,
4
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
7
,
1
,
true
,
true
>
;
//
| InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|
ABlockLds|
BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer|
BBlockLds| MRepeatPer| NRepeatPer| MRepeat| MThread| NRepeat| NThread| NScalarPer
|
//
| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar|
AddExtraM|
ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar|
AddExtraN| Shuffle| Shuffle| Thread| _CCopy| Thread| _CCopy| Vector
|
//
| | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1|
|
Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1|
|
_CCopy| _CCopy| _CCopy
|
|
_CCopy
|
|
_CCopy
|
//
| | | | | | | | | | | | | | | | | | | | | | | |
|
| | | | | | |
|
|
|
|
|
| |
|
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
256
,
128
,
256
,
4
,
8
,
32
,
32
,
2
,
4
,
S
<
1
,
2
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
1
,
4
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
1
,
32
,
1
,
8
,
8
>
;
// clang-format on
template
<
typename
TIn
,
...
...
example/6_conv2d_fwd_xdl_output_shuffle_bias_relu_add/conv2d_fwd_xdl_output_shuffle_bias_relu_add.cpp
View file @
681ede91
...
...
@@ -33,11 +33,11 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd_v2;
// clang-format off
using
DeviceConvFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceConv2dFwdXdl_Output_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer|
CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds
|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar|
SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN
|
// | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1|
| PerVector
| | |
// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
|
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
256
,
128
,
256
,
4
,
8
,
32
,
32
,
2
,
4
,
S
<
1
,
2
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
S
<
1
,
4
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
7
,
1
,
true
,
true
>
;
// | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|
ABlockLds|
BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer|
BBlockLds| MRepeatPer| NRepeatPer| MRepeat| MThread| NRepeat| NThread| NScalarPer
|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar|
AddExtraM|
ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar|
AddExtraN| Shuffle| Shuffle| Thread| _CCopy| Thread| _CCopy| Vector
|
// | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1|
|
Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1|
|
_CCopy| _CCopy| _CCopy
|
|
_CCopy
|
|
_CCopy
|
// | | | | | | | | | | | | | | | | | | | | | | | |
|
| | | | | | |
|
|
|
|
|
| |
|
<
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
256
,
128
,
256
,
4
,
8
,
32
,
32
,
2
,
4
,
S
<
1
,
2
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
1
,
4
,
8
>
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
1
,
32
,
1
,
8
,
8
>
;
// clang-format on
template
<
typename
TIn
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment