Commit 80c84d08 authored by Qianfeng Zhang
Browse files

Tiny adjustment in qr_ks_vs_async pipeline for better performance

parent 59e3cb05
......@@ -283,14 +283,13 @@ struct BlockFmhaPipelineQRKSVSAsync
static_assert(1 <= k1_loops);
do
{
// STAGE 1, QK gemm
clear_tile(s_acc); // initialize C
store_tile(k_lds_window, k_tile);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
// STAGE 1, QK gemm
clear_tile(s_acc); // initialize C
if(i_total_loops < num_total_loop - 1)
{
move_tile_window(k_dram_window, {kN0, 0});
......@@ -299,6 +298,9 @@ struct BlockFmhaPipelineQRKSVSAsync
__builtin_amdgcn_sched_barrier(0);
// ensure k is completely updated on LDS
block_sync_lds();
// for kQKHeaddim == 96 (kSubQKHeaddim == 128), we need to use k0_loops
if constexpr(kQKHeaddim == kSubQKHeaddim)
{
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment