Add VLLM_USE_PA_PRINT_PARAM to print FA-PA tensor shapes and call parameters

1092a467 · zhuwenwen · 91feb245 · 1092a467
Commit 1092a467 authored Jul 17, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 0 deletions

vllm/attention/backends/rocm_flash_attn.py +6 -0

No files found.
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -970,6 +970,12 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                tree_attention_masks_tensor = decode_meta.tree_attention_masks_tensor
                if envs.VLLM_USE_FLASH_ATTN_PA:
                    from flash_attn import vllm_flash_attn_with_kvcache
+                    if envs.VLLM_USE_PA_PRINT_PARAM:
+                        print("PA SIZE:")
+                        print(f"q.shape = {decode_query.unsqueeze(1).shape}, key_cache.shape = {key_cache.shape}, value_cache.shape = {value_cache.shape}, kv_cache_dtype = {self.kv_cache_dtype}")
+                        print(f"block_size= {block_size}, cache_seqlens.shape = {decode_meta.seq_lens_tensor.shape}, block_tables.shape = {decode_meta.block_tables.shape}")
+                        print(f"softmax_scale = {self.scale:.3f}, window_size = {self.sliding_window}, softcap = {self.logits_soft_cap}, alibi_slopes = {self.alibi_slopes}")
                    # output[num_prefill_tokens:] = self.fa_decode_attn_func(
                    output[num_prefill_tokens:] = vllm_flash_attn_with_kvcache(
                        q=decode_query.unsqueeze(1),