Commit 76871a6f authored by Qianfeng Zhang
Browse files

Move clear_tile(s_acc) for better interleaving

parent a979d030
......@@ -377,8 +377,6 @@ struct BlockFmhaPipelineQRKSVSAsync
}
else // executed by intermediate and last iteration
{
clear_tile(s_acc); // initialize C
if(i_total_loops < num_total_loop - 1)
{
move_tile_window(k_dram_window, {kN0, 0});
......@@ -390,6 +388,9 @@ struct BlockFmhaPipelineQRKSVSAsync
sequence<((i_k0 % NumKLdsBuffers) + 1) * kN0, kK0>{});
store_tile(k_lds_window_tmp, k_tiles[number<i_k0>{}]);
if constexpr(i_k0 == 0)
clear_tile(s_acc);
k_tiles[number<i_k0>{}] = load_tile(k_dram_window);
if constexpr(i_k0 < k0_loops - 1)
move_tile_window(k_dram_window, {0, kK0});
......@@ -413,6 +414,9 @@ struct BlockFmhaPipelineQRKSVSAsync
sequence<((i_k0 % NumKLdsBuffers) + 1) * kN0, kK0>{});
store_tile(k_lds_window_tmp, k_tiles[number<i_k0>{}]);
if constexpr(i_k0 == 0)
clear_tile(s_acc);
block_sync_lds();
// execute last unroll of gemm_0
gemm_0(s_acc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment