Commit 7aa10584 authored by Jing Zhang's avatar Jing Zhang
Browse files

add row/col instances

parent 552053e2
...@@ -34,95 +34,137 @@ template <class F> ...@@ -34,95 +34,137 @@ template <class F>
std::vector<Operation_Xdl_CShuffle> CreateOperationsImpl(F f, Layout ALayout, Layout BLayout) std::vector<Operation_Xdl_CShuffle> CreateOperationsImpl(F f, Layout ALayout, Layout BLayout)
{ {
std::vector<Operation_Xdl_CShuffle> result; std::vector<Operation_Xdl_CShuffle> result;
// Tile Desc: (block_size, m_per_block, n_per_block, k_per_block, ak1, bk1,
// m_per_XDL, n_per_XDL, m_Xdl_per_wave, n_Xdl_per_wave, num_gemmk_prefetch_stage)
std::vector<operation::TileDesc> tile_descriptions = { std::vector<operation::TileDesc> tile_descriptions = {
{256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 1}, // clang-format off
{256, 128, 256, 32, 8, 8, 32, 32, 2, 4, 1}, // Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| NumGemmK|
{128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 1}, // Size| Block| Block| Block| | | XDL| XDL| Per| Per| Prefetch|
{256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 1}, // | | | | | | | | Wave| Wave| Stage|
{128, 128, 64, 32, 8, 8, 32, 32, 2, 2, 1}, // | | | | | | | | | | |
{128, 64, 128, 32, 8, 8, 32, 32, 2, 2, 1}, { 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, 1},
{64, 64, 64, 32, 8, 8, 32, 32, 2, 2, 1}, { 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 1},
{256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 1}, { 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, 1},
{256, 64, 128, 32, 8, 8, 32, 32, 1, 2, 1}, { 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, 1},
{128, 128, 32, 32, 8, 8, 32, 32, 2, 1, 1}, { 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, 1},
{128, 32, 128, 32, 8, 8, 32, 32, 1, 2, 1}, { 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 1},
{64, 64, 32, 32, 8, 8, 32, 32, 2, 1, 1}, { 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, 1},
{64, 32, 64, 32, 8, 8, 32, 32, 1, 2, 1}, { 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 1},
{ 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, 1},
{ 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, 1},
{ 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, 1},
{ 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, 1},
{ 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, 1},
{ 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 1},
{ 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, 1},
{ 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, 1},
// clang-format on
}; };
// BlockTransferDesc: (thread_cluster_length, thread_cluster_arrange_order, src_access_order, std::vector<operation::BlockTransferDesc> a_block_descriptions_rowmajor = {
// src_vec_dim, src_scalar_per_vector, dst_scalar_per_vector_k1, lds_add_extra_dim ) // clang-format off
auto ABlockTransferSrcVectorDim = ALayout == Layout::Column ? 1 : 2; // ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|
std::vector<operation::BlockTransferDesc> a_block_descriptions = { // ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM|
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, // Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, // | | | | | | |
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
// clang-format on
}; };
auto BBlockTransferSrcVectorDim = BLayout == Layout::Row ? 1 : 2; std::vector<operation::BlockTransferDesc> b_block_descriptions_rowmajor = {
std::vector<operation::BlockTransferDesc> b_block_descriptions = { // clang-format off
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, // BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, // ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, // Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, // | | | | | | |
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, { S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0},
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0},
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
{S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0},
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
{S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, { S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0},
{S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, { S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 8, 8, 1}, { S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0},
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0},
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
{ S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0},
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1},
{ S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0},
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
// clang-format on
}; };
// cshuffle_descriptions: (m_Xdl_per_wave_per_shuffle, n_Xdl_per_wave_per_shuffle)
std::vector<operation::CShuffleDesc> cshuffle_descriptions = { std::vector<operation::CShuffleDesc> cshuffle_descriptions = {
{1, 1}, // clang-format off
{1, 1}, // CShuffle| CShuffle|
{1, 1}, // MXdlPerWave| NXdlPerWave|
{1, 1}, // PerShuffle| PerShuffle|
{1, 1}, // | |
{1, 1}, { 1, 1},
{1, 1}, { 1, 1},
{1, 1}, { 1, 1},
{1, 1}, { 1, 1},
{1, 1}, { 1, 1},
{1, 1}, { 1, 1},
{1, 1}, { 1, 1},
{1, 1}, { 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
// clang-format on
}; };
// CBlockTransferDesc: (cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl,
// scalar_per_vector_n_wave_n_per_Xdl)
std::vector<operation::CBlockTransferDesc> c_block_descriptions = { std::vector<operation::CBlockTransferDesc> c_block_descriptions = {
{S<1, 32, 1, 8>, 8}, // clang-format off
{S<1, 32, 1, 8>, 8}, // CBlockTransferClusterLengths| CBlockTransfer
{S<1, 16, 1, 8>, 8}, // _MBlock_MWaveMPerXdl| ScalarPerVector
{S<1, 32, 1, 8>, 8}, // _NBlock_NWaveNPerXdl| _NWaveNPerXdl
{S<1, 32, 1, 4>, 8}, // |
{S<1, 16, 1, 8>, 8}, { S<1, 32, 1, 8>, 8},
{S<1, 16, 1, 4>, 8}, { S<1, 32, 1, 8>, 8},
{S<1, 32, 1, 8>, 8}, { S<1, 32, 1, 8>, 8},
{S<1, 32, 1, 8>, 8}, { S<1, 32, 1, 8>, 8},
{S<1, 32, 1, 4>, 8}, { S<1, 16, 1, 8>, 8},
{S<1, 16, 1, 8>, 8}, { S<1, 16, 1, 8>, 8},
{S<1, 16, 1, 4>, 8}, { S<1, 32, 1, 8>, 8},
{S<1, 16, 1, 4>, 8}, { S<1, 32, 1, 8>, 8},
{ S<1, 32, 1, 4>, 8},
{ S<1, 32, 1, 4>, 8},
{ S<1, 16, 1, 8>, 8},
{ S<1, 16, 1, 8>, 8},
{ S<1, 32, 1, 8>, 8},
{ S<1, 32, 1, 8>, 8},
{ S<1, 32, 1, 8>, 8},
{ S<1, 32, 1, 8>, 8},
// clang-format on
}; };
const auto a_block_descriptions =
(ALayout == Layout::Row) ? a_block_descriptions_rowmajor : b_block_descriptions_rowmajor;
const auto b_block_descriptions =
(BLayout == Layout::Row) ? b_block_descriptions_rowmajor : a_block_descriptions_rowmajor;
assert(tile_descriptions.size() == a_block_descriptions.size()); assert(tile_descriptions.size() == a_block_descriptions.size());
assert(tile_descriptions.size() == b_block_descriptions.size()); assert(tile_descriptions.size() == b_block_descriptions.size());
assert(tile_descriptions.size() == cshuffle_descriptions.size()); assert(tile_descriptions.size() == cshuffle_descriptions.size());
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment