Commit 681ede91 authored by Chao Liu's avatar Chao Liu
Browse files

clean up

parent edb1d2c3
...@@ -83,6 +83,7 @@ template <index_t BlockSize, ...@@ -83,6 +83,7 @@ template <index_t BlockSize,
index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_K1, index_t ABlockTransferDstScalarPerVector_K1,
bool AThreadTransferSrcResetCoordinateAfterRun, bool AThreadTransferSrcResetCoordinateAfterRun,
bool ABlockLdsExtraM,
typename BBlockTransferThreadSliceLengths_K0_N_K1, typename BBlockTransferThreadSliceLengths_K0_N_K1,
typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferThreadClusterArrangeOrder,
...@@ -91,11 +92,14 @@ template <index_t BlockSize, ...@@ -91,11 +92,14 @@ template <index_t BlockSize,
index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_K1, index_t BBlockTransferDstScalarPerVector_K1,
bool BThreadTransferSrcResetCoordinateAfterRun, bool BThreadTransferSrcResetCoordinateAfterRun,
typename CThreadTransferSrcDstAccessOrder, bool BBlockLdsExtraN,
index_t CThreadTransferSrcDstVectorDim, index_t MRepeatPerShuffle_CCopy,
index_t CThreadTransferDstScalarPerVector, index_t NRepeatPerShuffle_CCopy,
bool ABlockLdsExtraM, index_t MRepeatThread_CCopy,
bool BBlockLdsExtraN> index_t MThread_CCopy,
index_t NRepeatThread_CCopy,
index_t NThread_CCopy,
index_t NScalarPerVector_CCopy>
struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
{ {
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
...@@ -498,7 +502,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 ...@@ -498,7 +502,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
// shuffle and write out // shuffle and write out
{ {
#if 1 #if 0
// TODO: make it tunable // TODO: make it tunable
constexpr index_t MRepeatPerShuffle_CCopy = 1; constexpr index_t MRepeatPerShuffle_CCopy = 1;
constexpr index_t NRepeatPerShuffle_CCopy = 1; constexpr index_t NRepeatPerShuffle_CCopy = 1;
...@@ -511,7 +515,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 ...@@ -511,7 +515,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
// vector length for blockwise copy from LDS to global // vector length for blockwise copy from LDS to global
constexpr index_t NScalarPerVector_CCopy = 8; constexpr index_t NScalarPerVector_CCopy = 8;
#else #elif 0
// TODO: make it tunable // TODO: make it tunable
constexpr index_t MRepeatPerShuffle_CCopy = 1; constexpr index_t MRepeatPerShuffle_CCopy = 1;
constexpr index_t NRepeatPerShuffle_CCopy = 2; constexpr index_t NRepeatPerShuffle_CCopy = 2;
...@@ -654,53 +658,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 ...@@ -654,53 +658,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
n_thread_data_on_block_idx[I2]), n_thread_data_on_block_idx[I2]),
ck::tensor_operation::element_wise::PassThrough{}}; ck::tensor_operation::element_wise::PassThrough{}};
#if 0 auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v6r1<
auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v4r1<
BlockSize, // index_t BlockSize,
ck::tensor_operation::element_wise::PassThrough, // SrcElementwiseOperation,
CElementwiseOperation, // DstElementwiseOperation,
CGlobalMemoryDataOperation, // DstInMemOp,
Sequence<1,
MRepeatPerShuffle_CCopy,
MPerBlock_CCopy,
1,
NRepeatPerShuffle_CCopy,
NPerBlock_CCopy>, // BlockSliceLengths,
Sequence<1,
MRepeatPerShuffle_CCopy,
MPerThread_CCopy,
1,
NRepeatPerShuffle_CCopy,
NPerThread_CCopy>, // ThreadSliceLengths,
Sequence<1,
MRepeatPerThread_CCopy,
MThread_CCopy,
1,
NRepeatPerThread_CCopy,
NThread_CCopy>, // ThreadClusterLengths,
Sequence<0, 1, 2, 3, 4, 5>, // typename ThreadClusterArrangeOrder,
FloatC, // typename SrcData,
FloatC, // typename DstData,
decltype(c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl),
decltype(c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl),
Sequence<0, 1, 2, 3, 4, 5>, // typename SrcDimAccessOrder,
Sequence<0, 1, 2, 3, 4, 5>, // typename DstDimAccessOrder,
5, // index_t SrcVectorDim,
5, // index_t DstVectorDim,
NScalarPerVector_CCopy, // index_t SrcScalarPerVector,
NScalarPerVector_CCopy, // index_t DstScalarPerVector,
1, // index_t SrcScalarStrideInVector,
1, // index_t DstScalarStrideInVector,
true, // bool ThreadTransferSrcResetCoordinateAfterRun,
false> // bool ThreadTransferDstResetCoordinateAfterRun>
{c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
make_multi_index(0, 0, 0, 0, 0, 0),
ck::tensor_operation::element_wise::PassThrough{},
c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0),
c_element_op};
#else
auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v6r1<
BlockSize, // index_t BlockSize, BlockSize, // index_t BlockSize,
CElementwiseOperation, // ElementwiseOperation, CElementwiseOperation, // ElementwiseOperation,
CGlobalMemoryDataOperation, // DstInMemOp, CGlobalMemoryDataOperation, // DstInMemOp,
...@@ -730,14 +688,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 ...@@ -730,14 +688,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
Sequence<0, 1, 2, 3, 4, 5>, // typename DimAccessOrder, Sequence<0, 1, 2, 3, 4, 5>, // typename DimAccessOrder,
5, // index_t VectorDim, 5, // index_t VectorDim,
NScalarPerVector_CCopy, // index_t ScalarPerVector, NScalarPerVector_CCopy, // index_t ScalarPerVector,
true, // bool ThreadTransferSrcResetCoordinateAfterRun, true, // bool ThreadTransferSrcResetCoordinateAfterRun,
false> // bool ThreadTransferDstResetCoordinateAfterRun> false> // bool ThreadTransferDstResetCoordinateAfterRun>
{c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl, {c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
make_multi_index(0, 0, 0, 0, 0, 0), make_multi_index(0, 0, 0, 0, 0, 0),
c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl, c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0),
c_element_op}; c_element_op};
#endif
constexpr auto mrepeat_forward_step = constexpr auto mrepeat_forward_step =
make_multi_index(0, MRepeatPerShuffle_CCopy, 0, 0, 0, 0); make_multi_index(0, MRepeatPerShuffle_CCopy, 0, 0, 0, 0);
......
...@@ -97,6 +97,7 @@ template <index_t BlockSize, ...@@ -97,6 +97,7 @@ template <index_t BlockSize,
index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_K1, index_t ABlockTransferDstScalarPerVector_K1,
bool AThreadTransferSrcResetCoordinateAfterRun, bool AThreadTransferSrcResetCoordinateAfterRun,
bool ABlockLdsExtraM,
typename BBlockTransferThreadSliceLengths_K0_N_K1, typename BBlockTransferThreadSliceLengths_K0_N_K1,
typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferThreadClusterArrangeOrder,
...@@ -105,11 +106,14 @@ template <index_t BlockSize, ...@@ -105,11 +106,14 @@ template <index_t BlockSize,
index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_K1, index_t BBlockTransferDstScalarPerVector_K1,
bool BThreadTransferSrcResetCoordinateAfterRun, bool BThreadTransferSrcResetCoordinateAfterRun,
typename CThreadTransferSrcDstAccessOrder, bool BBlockLdsExtraN,
index_t CThreadTransferSrcDstVectorDim, index_t MRepeatPerShuffle_CCopy,
index_t CThreadTransferDstScalarPerVector, index_t NRepeatPerShuffle_CCopy,
bool ABlockLdsExtraM, index_t MRepeatThread_CCopy,
bool BBlockLdsExtraN> index_t MThread_CCopy,
index_t NRepeatThread_CCopy,
index_t NThread_CCopy,
index_t NScalarPerVector_CCopy>
struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
{ {
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
...@@ -537,7 +541,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 ...@@ -537,7 +541,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
// shuffle and write out // shuffle and write out
{ {
#if 1 #if 0
// TODO: make it tunable // TODO: make it tunable
constexpr index_t MRepeatPerShuffle_CCopy = 1; constexpr index_t MRepeatPerShuffle_CCopy = 1;
constexpr index_t NRepeatPerShuffle_CCopy = 1; constexpr index_t NRepeatPerShuffle_CCopy = 1;
...@@ -550,7 +554,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 ...@@ -550,7 +554,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
// vector length for blockwise copy from LDS to global // vector length for blockwise copy from LDS to global
constexpr index_t NScalarPerVector_CCopy = 8; constexpr index_t NScalarPerVector_CCopy = 8;
#else #elif 0
// TODO: make it tunable // TODO: make it tunable
constexpr index_t MRepeatPerShuffle_CCopy = 1; constexpr index_t MRepeatPerShuffle_CCopy = 1;
constexpr index_t NRepeatPerShuffle_CCopy = 2; constexpr index_t NRepeatPerShuffle_CCopy = 2;
......
...@@ -41,6 +41,7 @@ template <typename InDataType, ...@@ -41,6 +41,7 @@ template <typename InDataType,
ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1, ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadSliceLengths_K0_N_K1, typename BBlockTransferThreadSliceLengths_K0_N_K1,
typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferThreadClusterArrangeOrder,
...@@ -48,10 +49,14 @@ template <typename InDataType, ...@@ -48,10 +49,14 @@ template <typename InDataType,
ck::index_t BBlockTransferSrcVectorDim, ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector, ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1, ck::index_t BBlockTransferDstScalarPerVector_K1,
ck::index_t CThreadTransferSrcDstVectorDim, bool BBlockLdsAddExtraN,
ck::index_t CThreadTransferDstScalarPerVector, index_t MRepeatPerShuffle_CCopy,
bool ABlockLdsAddExtraM, index_t NRepeatPerShuffle_CCopy,
bool BBlockLdsAddExtraN> index_t MRepeatThread_CCopy,
index_t MThread_CCopy,
index_t NRepeatThread_CCopy,
index_t NThread_CCopy,
index_t NScalarPerVector_CCopy>
struct struct
DeviceConv2dFwdXdl_Output_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K DeviceConv2dFwdXdl_Output_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
: public DeviceConvFwdBiasActivationAdd<InElementwiseOperation, : public DeviceConvFwdBiasActivationAdd<InElementwiseOperation,
...@@ -260,6 +265,7 @@ struct ...@@ -260,6 +265,7 @@ struct
ABlockTransferSrcScalarPerVector, ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1, ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun, false, // AThreadTransferSrcResetCoordinateAfterRun,
ABlockLdsAddExtraM,
BBlockTransferThreadSliceLengths_K0_N_K1, BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterLengths_K0_N_K1,
Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder, Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder,
...@@ -267,12 +273,15 @@ struct ...@@ -267,12 +273,15 @@ struct
2, // BBlockTransferSrcVectorDim, 2, // BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1, BBlockTransferDstScalarPerVector_K1,
false, // BThreadTransferSrcResetCoordinateAfterRun, false, // BThreadTransferSrcResetCoordinateAfterRun,
Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder, BBlockLdsAddExtraN,
7, // CThreadTransferSrcDstVectorDim, MRepeatPerShuffle_CCopy,
CThreadTransferDstScalarPerVector, NRepeatPerShuffle_CCopy,
ABlockLdsAddExtraM, MRepeatThread_CCopy,
BBlockLdsAddExtraN>; MThread_CCopy,
NRepeatThread_CCopy,
NThread_CCopy,
NScalarPerVector_CCopy>;
// Argument // Argument
struct Argument : public BaseArgument struct Argument : public BaseArgument
......
...@@ -40,6 +40,7 @@ template <typename InDataType, ...@@ -40,6 +40,7 @@ template <typename InDataType,
ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1, ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadSliceLengths_K0_N_K1, typename BBlockTransferThreadSliceLengths_K0_N_K1,
typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferThreadClusterArrangeOrder,
...@@ -47,10 +48,14 @@ template <typename InDataType, ...@@ -47,10 +48,14 @@ template <typename InDataType,
ck::index_t BBlockTransferSrcVectorDim, ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector, ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1, ck::index_t BBlockTransferDstScalarPerVector_K1,
ck::index_t CThreadTransferSrcDstVectorDim, bool BBlockLdsAddExtraN,
ck::index_t CThreadTransferDstScalarPerVector, index_t MRepeatPerShuffle_CCopy,
bool ABlockLdsAddExtraM, index_t NRepeatPerShuffle_CCopy,
bool BBlockLdsAddExtraN> index_t MRepeatThread_CCopy,
index_t MThread_CCopy,
index_t NRepeatThread_CCopy,
index_t NThread_CCopy,
index_t NScalarPerVector_CCopy>
struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
: public DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation> : public DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>
{ {
...@@ -241,6 +246,7 @@ struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N ...@@ -241,6 +246,7 @@ struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N
ABlockTransferSrcScalarPerVector, ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1, ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun, false, // AThreadTransferSrcResetCoordinateAfterRun,
ABlockLdsAddExtraM,
BBlockTransferThreadSliceLengths_K0_N_K1, BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterLengths_K0_N_K1,
Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder, Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder,
...@@ -248,12 +254,15 @@ struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N ...@@ -248,12 +254,15 @@ struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N
2, // BBlockTransferSrcVectorDim, 2, // BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1, BBlockTransferDstScalarPerVector_K1,
false, // BThreadTransferSrcResetCoordinateAfterRun, false, // BThreadTransferSrcResetCoordinateAfterRun,
Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder, BBlockLdsAddExtraN,
7, // CThreadTransferSrcDstVectorDim, MRepeatPerShuffle_CCopy,
CThreadTransferDstScalarPerVector, NRepeatPerShuffle_CCopy,
ABlockLdsAddExtraM, MRepeatThread_CCopy,
BBlockLdsAddExtraN>; MThread_CCopy,
NRepeatThread_CCopy,
NThread_CCopy,
NScalarPerVector_CCopy>;
// Argument // Argument
struct Argument : public BaseArgument struct Argument : public BaseArgument
......
...@@ -33,11 +33,11 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough_v2; ...@@ -33,11 +33,11 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough_v2;
using DeviceConvFwdInstance = ck::tensor_operation::device:: using DeviceConvFwdInstance = ck::tensor_operation::device::
DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// clang-format off // clang-format off
//##| InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds| // | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| MRepeatPer| NRepeatPer| MRepeat| MThread| NRepeat| NThread| NScalarPer|
//##| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN| // | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| Shuffle| Shuffle| Thread| _CCopy| Thread| _CCopy| Vector|
//##| | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | | // | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | _CCopy| _CCopy| _CCopy| | _CCopy| | _CCopy|
//##| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, S<1, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 7, 1, true, true>; <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<1, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, 1, 32, 1, 8, 8>;
// clang-format on // clang-format on
template <typename TIn, template <typename TIn,
......
...@@ -33,11 +33,11 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd_v2; ...@@ -33,11 +33,11 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd_v2;
// clang-format off // clang-format off
using DeviceConvFwdInstance = ck::tensor_operation::device:: using DeviceConvFwdInstance = ck::tensor_operation::device::
DeviceConv2dFwdXdl_Output_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K DeviceConv2dFwdXdl_Output_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds| // | InData| WeiData| OutData| AccData| In| Wei| Out| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| MRepeatPer| NRepeatPer| MRepeat| MThread| NRepeat| NThread| NScalarPer|
// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| SrcDstVectorDim| DstScalar| AddExtraM| AddExtraN| // | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadSlice| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| Shuffle| Shuffle| Thread| _CCopy| Thread| _CCopy| Vector|
// | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| | | // | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | _CCopy| _CCopy| _CCopy| | _CCopy| | _CCopy|
// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, S<1, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 7, 1, true, true>; <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<1, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, 1, 32, 1, 8, 8>;
// clang-format on // clang-format on
template <typename TIn, template <typename TIn,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment