Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7aa10584
Commit
7aa10584
authored
Nov 06, 2023
by
Jing Zhang
Browse files
add row/col instances
parent
552053e2
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
118 additions
and
76 deletions
+118
-76
host/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
host/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
+118
-76
No files found.
host/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
View file @
7aa10584
...
@@ -34,95 +34,137 @@ template <class F>
...
@@ -34,95 +34,137 @@ template <class F>
std
::
vector
<
Operation_Xdl_CShuffle
>
CreateOperationsImpl
(
F
f
,
Layout
ALayout
,
Layout
BLayout
)
std
::
vector
<
Operation_Xdl_CShuffle
>
CreateOperationsImpl
(
F
f
,
Layout
ALayout
,
Layout
BLayout
)
{
{
std
::
vector
<
Operation_Xdl_CShuffle
>
result
;
std
::
vector
<
Operation_Xdl_CShuffle
>
result
;
// Tile Desc: (block_size, m_per_block, n_per_block, k_per_block, ak1, bk1,
// m_per_XDL, n_per_XDL, m_Xdl_per_wave, n_Xdl_per_wave, num_gemmk_prefetch_stage)
std
::
vector
<
operation
::
TileDesc
>
tile_descriptions
=
{
std
::
vector
<
operation
::
TileDesc
>
tile_descriptions
=
{
{
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
1
},
// clang-format off
{
256
,
128
,
256
,
32
,
8
,
8
,
32
,
32
,
2
,
4
,
1
},
// Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| NumGemmK|
{
128
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
1
},
// Size| Block| Block| Block| | | XDL| XDL| Per| Per| Prefetch|
{
256
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
1
},
// | | | | | | | | Wave| Wave| Stage|
{
128
,
128
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
1
},
// | | | | | | | | | | |
{
128
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
1
},
{
256
,
256
,
128
,
32
,
8
,
2
,
32
,
32
,
4
,
2
,
1
},
{
64
,
64
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
1
},
{
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
1
},
{
256
,
128
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
1
},
{
256
,
128
,
256
,
32
,
8
,
2
,
32
,
32
,
2
,
4
,
1
},
{
256
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
1
},
{
256
,
128
,
256
,
32
,
8
,
8
,
32
,
32
,
2
,
4
,
1
},
{
128
,
128
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
1
},
{
128
,
128
,
128
,
32
,
8
,
2
,
32
,
32
,
4
,
2
,
1
},
{
128
,
32
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
1
},
{
128
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
1
},
{
64
,
64
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
1
},
{
256
,
128
,
128
,
32
,
8
,
2
,
32
,
32
,
2
,
2
,
1
},
{
64
,
32
,
64
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
1
},
{
256
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
1
},
{
128
,
128
,
64
,
32
,
8
,
2
,
32
,
32
,
2
,
2
,
1
},
{
128
,
128
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
1
},
{
128
,
64
,
128
,
32
,
8
,
2
,
32
,
32
,
2
,
2
,
1
},
{
128
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
1
},
{
256
,
128
,
64
,
32
,
8
,
2
,
32
,
32
,
2
,
1
,
1
},
{
256
,
128
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
1
},
{
256
,
64
,
128
,
32
,
8
,
2
,
32
,
32
,
1
,
2
,
1
},
{
256
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
1
},
// clang-format on
};
};
// BlockTransferDesc: (thread_cluster_length, thread_cluster_arrange_order, src_access_order,
std
::
vector
<
operation
::
BlockTransferDesc
>
a_block_descriptions_rowmajor
=
{
// src_vec_dim, src_scalar_per_vector, dst_scalar_per_vector_k1, lds_add_extra_dim )
// clang-format off
auto
ABlockTransferSrcVectorDim
=
ALayout
==
Layout
::
Column
?
1
:
2
;
// ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|
std
::
vector
<
operation
::
BlockTransferDesc
>
a_block_descriptions
=
{
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM|
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
// Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
// | | | | | | |
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
},
// clang-format on
};
};
auto
BBlockTransferSrcVectorDim
=
BLayout
==
Layout
::
Row
?
1
:
2
;
std
::
vector
<
operation
::
BlockTransferDesc
>
b_block_descriptions_rowmajor
=
{
std
::
vector
<
operation
::
BlockTransferDesc
>
b_block_descriptions
=
{
// clang-format off
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
// BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
// Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
// | | | | | | |
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
0
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
},
{
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
0
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
0
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
0
},
{
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
},
{
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
8
,
8
,
1
},
{
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
0
},
{
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
},
{
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
0
},
{
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
},
{
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
0
},
{
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
8
,
1
},
{
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
0
},
{
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
},
// clang-format on
};
};
// cshuffle_descriptions: (m_Xdl_per_wave_per_shuffle, n_Xdl_per_wave_per_shuffle)
std
::
vector
<
operation
::
CShuffleDesc
>
cshuffle_descriptions
=
{
std
::
vector
<
operation
::
CShuffleDesc
>
cshuffle_descriptions
=
{
{
1
,
1
},
// clang-format off
{
1
,
1
},
// CShuffle| CShuffle|
{
1
,
1
},
// MXdlPerWave| NXdlPerWave|
{
1
,
1
},
// PerShuffle| PerShuffle|
{
1
,
1
},
// | |
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
{
1
,
1
},
// clang-format on
};
};
// CBlockTransferDesc: (cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl,
// scalar_per_vector_n_wave_n_per_Xdl)
std
::
vector
<
operation
::
CBlockTransferDesc
>
c_block_descriptions
=
{
std
::
vector
<
operation
::
CBlockTransferDesc
>
c_block_descriptions
=
{
{
S
<
1
,
32
,
1
,
8
>
,
8
},
// clang-format off
{
S
<
1
,
32
,
1
,
8
>
,
8
},
// CBlockTransferClusterLengths| CBlockTransfer
{
S
<
1
,
16
,
1
,
8
>
,
8
},
// _MBlock_MWaveMPerXdl| ScalarPerVector
{
S
<
1
,
32
,
1
,
8
>
,
8
},
// _NBlock_NWaveNPerXdl| _NWaveNPerXdl
{
S
<
1
,
32
,
1
,
4
>
,
8
},
// |
{
S
<
1
,
16
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
16
,
1
,
4
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
4
>
,
8
},
{
S
<
1
,
16
,
1
,
8
>
,
8
},
{
S
<
1
,
16
,
1
,
8
>
,
8
},
{
S
<
1
,
16
,
1
,
8
>
,
8
},
{
S
<
1
,
16
,
1
,
4
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
16
,
1
,
4
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
4
>
,
8
},
{
S
<
1
,
32
,
1
,
4
>
,
8
},
{
S
<
1
,
16
,
1
,
8
>
,
8
},
{
S
<
1
,
16
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
{
S
<
1
,
32
,
1
,
8
>
,
8
},
// clang-format on
};
};
const
auto
a_block_descriptions
=
(
ALayout
==
Layout
::
Row
)
?
a_block_descriptions_rowmajor
:
b_block_descriptions_rowmajor
;
const
auto
b_block_descriptions
=
(
BLayout
==
Layout
::
Row
)
?
b_block_descriptions_rowmajor
:
a_block_descriptions_rowmajor
;
assert
(
tile_descriptions
.
size
()
==
a_block_descriptions
.
size
());
assert
(
tile_descriptions
.
size
()
==
a_block_descriptions
.
size
());
assert
(
tile_descriptions
.
size
()
==
b_block_descriptions
.
size
());
assert
(
tile_descriptions
.
size
()
==
b_block_descriptions
.
size
());
assert
(
tile_descriptions
.
size
()
==
cshuffle_descriptions
.
size
());
assert
(
tile_descriptions
.
size
()
==
cshuffle_descriptions
.
size
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment