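Tiny adjustment in the qr_ks_vs_async pipeline for better performance: block_sync_lds() is moved from right after store_tile(k_lds_window, k_tile) to later in the loop body, below the K-window advance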

80c84d08 · Qianfeng Zhang · 59e3cb05 · 80c84d08
Commit 80c84d08 authored Jan 24, 2025 by Qianfeng Zhang
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 4 deletions

include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp +6 -4

No files found.
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
@@ -283,14 +283,13 @@ struct BlockFmhaPipelineQRKSVSAsync
        static_assert(1 <= k1_loops);
        do
        {
-            // STAGE 1, QK gemm
-            clear_tile(s_acc); // initialize C
-
            store_tile(k_lds_window, k_tile);
-            block_sync_lds();

            __builtin_amdgcn_sched_barrier(0);

+            // STAGE 1, QK gemm
+            clear_tile(s_acc); // initialize C
+
            if(i_total_loops < num_total_loop - 1)
            {
                move_tile_window(k_dram_window, {kN0, 0});
@@ -299,6 +298,9 @@ struct BlockFmhaPipelineQRKSVSAsync

            __builtin_amdgcn_sched_barrier(0);

+            // ensure k is completely updated on LDS
+            block_sync_lds();
+
            // for kQKHeaddim == 96 (kSubQKHeaddim == 128), we need to use k0_loops
            if constexpr(kQKHeaddim == kSubQKHeaddim)
            {