Commit 76871a6f authored by Qianfeng Zhang
Browse files

Move clear_tile(s_acc) for better interleaving

parent a979d030
...@@ -377,8 +377,6 @@ struct BlockFmhaPipelineQRKSVSAsync ...@@ -377,8 +377,6 @@ struct BlockFmhaPipelineQRKSVSAsync
} }
else // executed by intermediate and last iteration else // executed by intermediate and last iteration
{ {
clear_tile(s_acc); // initialize C
if(i_total_loops < num_total_loop - 1) if(i_total_loops < num_total_loop - 1)
{ {
move_tile_window(k_dram_window, {kN0, 0}); move_tile_window(k_dram_window, {kN0, 0});
...@@ -390,6 +388,9 @@ struct BlockFmhaPipelineQRKSVSAsync ...@@ -390,6 +388,9 @@ struct BlockFmhaPipelineQRKSVSAsync
sequence<((i_k0 % NumKLdsBuffers) + 1) * kN0, kK0>{}); sequence<((i_k0 % NumKLdsBuffers) + 1) * kN0, kK0>{});
store_tile(k_lds_window_tmp, k_tiles[number<i_k0>{}]); store_tile(k_lds_window_tmp, k_tiles[number<i_k0>{}]);
if constexpr(i_k0 == 0)
clear_tile(s_acc);
k_tiles[number<i_k0>{}] = load_tile(k_dram_window); k_tiles[number<i_k0>{}] = load_tile(k_dram_window);
if constexpr(i_k0 < k0_loops - 1) if constexpr(i_k0 < k0_loops - 1)
move_tile_window(k_dram_window, {0, kK0}); move_tile_window(k_dram_window, {0, kK0});
...@@ -413,6 +414,9 @@ struct BlockFmhaPipelineQRKSVSAsync ...@@ -413,6 +414,9 @@ struct BlockFmhaPipelineQRKSVSAsync
sequence<((i_k0 % NumKLdsBuffers) + 1) * kN0, kK0>{}); sequence<((i_k0 % NumKLdsBuffers) + 1) * kN0, kK0>{});
store_tile(k_lds_window_tmp, k_tiles[number<i_k0>{}]); store_tile(k_lds_window_tmp, k_tiles[number<i_k0>{}]);
if constexpr(i_k0 == 0)
clear_tile(s_acc);
block_sync_lds(); block_sync_lds();
// execute last unroll of gemm_0 // execute last unroll of gemm_0
gemm_0(s_acc, gemm_0(s_acc,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment