Commit ea2c27ca authored by ltqin's avatar ltqin
Browse files

add limit for CDE0BlockTransferSrcScalarPerVector

parent fc50b641
...@@ -734,6 +734,10 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle ...@@ -734,6 +734,10 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
const auto wave_id = GetGemm0WaveIdx(); const auto wave_id = GetGemm0WaveIdx();
const auto wave_m_n_id = GetGemm0WaveMNIdx(wave_id[I2]); // I2: 0~63 const auto wave_m_n_id = GetGemm0WaveMNIdx(wave_id[I2]); // I2: 0~63
static_assert(CDE0BlockTransferSrcScalarPerVector <= n4,
"vector load must be not greater than n4");
static_assert(n4 % CDE0BlockTransferSrcScalarPerVector == 0);
auto d0s_threadwise_copy = generate_tuple( auto d0s_threadwise_copy = generate_tuple(
[&](auto i) { [&](auto i) {
return ThreadwiseTensorSliceTransfer_v2< return ThreadwiseTensorSliceTransfer_v2<
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment