Commit ee324424 authored by Qianfeng Zhang
Browse files

Move the clear_tile(s_acc) call into the first iteration of the k0 loop

parent 1a6a3715
......@@ -335,12 +335,13 @@ struct BlockFmhaPipelineQRKSVSAsync
{
store_tile(k_lds_windows[I0], k_tiles[I0]);
clear_tile(s_acc); // initialize C
static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) {
k_tiles[number<i_k0 + 1>{}] = load_tile(k_dram_window);
move_tile_window(k_dram_window, {0, kK0});
if constexpr(i_k0 == 0)
clear_tile(s_acc);
block_sync_lds();
// execute current unroll of gemm_0
gemm_0(s_acc,
......@@ -372,8 +373,6 @@ struct BlockFmhaPipelineQRKSVSAsync
{
store_tile(k_lds_windows[I0], k_tiles[I0]);
clear_tile(s_acc); // initialize C
static_for<0, k0_loops, 1>{}([&](auto i_k0) {
if constexpr(i_k0 < k0_loops - 1)
{
......@@ -381,6 +380,9 @@ struct BlockFmhaPipelineQRKSVSAsync
move_tile_window(k_dram_window, {0, kK0});
};
if constexpr(i_k0 == 0)
clear_tile(s_acc);
block_sync_lds();
// execute current unroll of gemm_0
gemm_0(s_acc,
......@@ -443,8 +445,6 @@ struct BlockFmhaPipelineQRKSVSAsync
{
store_tile(k_lds_windows[I0], k_tiles[I0]);
clear_tile(s_acc); // initialize C
static_for<0, k0_loops, 1>{}([&](auto i_k0) {
if constexpr(i_k0 < k0_loops - 1)
{
......@@ -452,6 +452,9 @@ struct BlockFmhaPipelineQRKSVSAsync
move_tile_window(k_dram_window, {0, kK0});
};
if constexpr(i_k0 == 0)
clear_tile(s_acc);
block_sync_lds();
// execute current unroll of gemm_0
gemm_0(s_acc,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment