Commit e86f3769 authored by Chao Liu's avatar Chao Liu
Browse files

clean up

parent 2488092e
...@@ -40,47 +40,47 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa ...@@ -40,47 +40,47 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// clang-format off // clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
<ALayout, // typename ALayout, <ALayout, // typename ALayout
BLayout, // typename BLayout, BLayout, // typename BLayout
CLayout, // typename CLayout, CLayout, // typename CLayout
ADataType, // typename ADataType, ADataType, // typename ADataType
BDataType, // typename BDataType, BDataType, // typename BDataType
CDataType, // typename CDataType, CDataType, // typename CDataType
AccDataType, // typename GemmAccDataType, AccDataType, // typename GemmAccDataType
CDataType, // typename CShuffleDataType, CDataType, // typename CShuffleDataType
PassThrough, // typename AElementwiseOperation, PassThrough, // typename AElementwiseOperation
PassThrough, // typename BElementwiseOperation, PassThrough, // typename BElementwiseOperation
PassThrough, // typename CElementwiseOperation, PassThrough, // typename CElementwiseOperation
GemmDefault, // GemmSpecialization GemmSpec, GemmDefault, // GemmSpecialization GemmSpec
1, // index_t NumGemmKPrefetchStage, 1, // index_t NumGemmKPrefetchStage
256, // index_t BlockSize, 256, // index_t BlockSize
256, // index_t MPerBlock, 256, // index_t MPerBlock
128, // index_t NPerBlock, 128, // index_t NPerBlock
32, // index_t KPerBlock, 32, // index_t KPerBlock
8, // index_t AK1, 8, // index_t AK1
8, // index_t BK1, 8, // index_t BK1
32, // index_t MPerXDL, 32, // index_t MPerXDL
32, // index_t NPerXDL, 32, // index_t NPerXDL
4, // index_t MXdlPerWave, 4, // index_t MXdlPerWave
2, // index_t NXdlPerWave, 2, // index_t NXdlPerWave
S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1, S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder, S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder, S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder
2, // index_t ABlockTransferSrcVectorDim, 2, // index_t ABlockTransferSrcVectorDim
8, // index_t ABlockTransferSrcScalarPerVector, 8, // index_t ABlockTransferSrcScalarPerVector
8, // index_t ABlockTransferDstScalarPerVector_AK1, 8, // index_t ABlockTransferDstScalarPerVector_AK1
1, // bool ABlockLdsExtraM, 1, // index_t ABlockLdsExtraM
S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1, S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder, S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder, S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder
2, // index_t BBlockTransferSrcVectorDim, 2, // index_t BBlockTransferSrcVectorDim
8, // index_t BBlockTransferSrcScalarPerVector, 8, // index_t BBlockTransferSrcScalarPerVector
8, // index_t BBlockTransferDstScalarPerVector_BK1, 8, // index_t BBlockTransferDstScalarPerVector_BK1
1, // bool BBlockLdsExtraN, 1, // index_t BBlockLdsExtraN
1, // index_t CShuffleMXdlPerWavePerShuffle, 1, // index_t CShuffleMXdlPerWavePerShuffle
1, // index_t CShuffleNXdlPerWavePerShuffle, 1, // index_t CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock> 8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
// clang-format on // clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host:: using ReferenceGemmInstance = ck::tensor_operation::host::
......
...@@ -42,8 +42,7 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -42,8 +42,7 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// clang-format off // clang-format off
#if 1 #if 1
...@@ -53,28 +52,20 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle ...@@ -53,28 +52,20 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< Row, Col, Row, F16, F16, F16, F32, F32, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; < Row, Col, Row, F16, F16, F16, F32, F32, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// // 1-stage prefetch #elif 0
// < Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// // 2-stage prefetch
// < Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
#elif 1
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_ProducerConsumer_CShuffle using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_ProducerConsumer_CShuffle
//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| ABBlockTransfer| BlockGemm| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| ABBlockTransfer| BlockGemm| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| ThreadGroupSize| ThreadGroupSize| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| ThreadGroupSize| ThreadGroupSize| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// < Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 8>, 8>;
< Row, Col, Row, F16, F16, F16, F32, F16, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 8>, 8>; < Row, Col, Row, F16, F16, F16, F32, F16, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 8>, 8>;
#elif 1 #elif 0
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| //######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>; < F16, F16, F16, F32, Row, Col, Row, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>;
// < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>;
// < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 8, 8, 16, 16, 2, 9, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 8, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
// < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 4, 8, 16, 16, 2, 9, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
#endif #endif
// clang-format on // clang-format on
......
...@@ -35,47 +35,47 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa ...@@ -35,47 +35,47 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// clang-format off // clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle< using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle<
ALayout, // typename ALayout, ALayout, // typename ALayout
BLayout, // typename BLayout, BLayout, // typename BLayout
CLayout, // typename CLayout, CLayout, // typename CLayout
ADataType, // typename ADataType, ADataType, // typename ADataType
BDataType, // typename BDataType, BDataType, // typename BDataType
CDataType, // typename CDataType, CDataType, // typename CDataType
AccDataType, // typename GemmAccDataType, AccDataType, // typename GemmAccDataType
CShuffleDataType, // typename CShuffleDataType, CShuffleDataType, // typename CShuffleDataType
PassThrough, // typename AElementwiseOperation, PassThrough, // typename AElementwiseOperation
PassThrough, // typename BElementwiseOperation, PassThrough, // typename BElementwiseOperation
PassThrough, // typename CElementwiseOperation, PassThrough, // typename CElementwiseOperation
GemmDefault, // GemmSpecialization GemmSpec, GemmDefault, // GemmSpecialization GemmSpec
1, // index_t NumGemmKPrefetchStage, 1, // index_t NumGemmKPrefetchStage
256, // index_t BlockSize, 256, // index_t BlockSize
256, // index_t MPerBlock, 256, // index_t MPerBlock
128, // index_t NPerBlock, 128, // index_t NPerBlock
64, // index_t KPerBlock, 64, // index_t KPerBlock
16, // index_t AK1, 16, // index_t AK1
16, // index_t BK1, 16, // index_t BK1
32, // index_t MPerXDL, 32, // index_t MPerXDL
32, // index_t NPerXDL, 32, // index_t NPerXDL
4, // index_t MXdlPerWave, 4, // index_t MXdlPerWave
2, // index_t NXdlPerWave, 2, // index_t NXdlPerWave
S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1, S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder, S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder, S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder
2, // index_t ABlockTransferSrcVectorDim, 2, // index_t ABlockTransferSrcVectorDim
16, // index_t ABlockTransferSrcScalarPerVector, 16, // index_t ABlockTransferSrcScalarPerVector
16, // index_t ABlockTransferDstScalarPerVector_AK1, 16, // index_t ABlockTransferDstScalarPerVector_AK1
1, // bool ABlockLdsExtraM, 1, // index_t ABlockLdsExtraM
S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1, S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder, S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder, S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder
2, // index_t BBlockTransferSrcVectorDim, 2, // index_t BBlockTransferSrcVectorDim
8, // index_t BBlockTransferSrcScalarPerVector, 8, // index_t BBlockTransferSrcScalarPerVector
8, // index_t BBlockTransferDstScalarPerVector_BK1, 8, // index_t BBlockTransferDstScalarPerVector_BK1
1, // bool BBlockLdsExtraN, 1, // index_t BBlockLdsExtraN
1, // index_t CShuffleMXdlPerWavePerShuffle, 1, // index_t CShuffleMXdlPerWavePerShuffle
1, // index_t CShuffleNXdlPerWavePerShuffle, 1, // index_t CShuffleNXdlPerWavePerShuffle
S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock> 16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
// clang-format on // clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host:: using ReferenceGemmInstance = ck::tensor_operation::host::
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment