"tests/vscode:/vscode.git/clone" did not exist on "e82ee40de3362afda8671e6f5daece0eaa7f0d51"
Commit 1092a467 authored by zhuwenwen's avatar zhuwenwen
Browse files

add VLLM_USE_PA_PRINT_PARAM to print fa-pa size

parent 91feb245
...@@ -970,6 +970,12 @@ class ROCmFlashAttentionImpl(AttentionImpl): ...@@ -970,6 +970,12 @@ class ROCmFlashAttentionImpl(AttentionImpl):
tree_attention_masks_tensor = decode_meta.tree_attention_masks_tensor tree_attention_masks_tensor = decode_meta.tree_attention_masks_tensor
if envs.VLLM_USE_FLASH_ATTN_PA: if envs.VLLM_USE_FLASH_ATTN_PA:
from flash_attn import vllm_flash_attn_with_kvcache from flash_attn import vllm_flash_attn_with_kvcache
if envs.VLLM_USE_PA_PRINT_PARAM:
print("PA SIZE:")
print(f"q.shape = {decode_query.unsqueeze(1).shape}, key_cache.shape = {key_cache.shape}, value_cache.shape = {value_cache.shape}, kv_cache_dtype = {self.kv_cache_dtype}")
print(f"block_size= {block_size}, cache_seqlens.shape = {decode_meta.seq_lens_tensor.shape}, block_tables.shape = {decode_meta.block_tables.shape}")
print(f"softmax_scale = {self.scale:.3f}, window_size = {self.sliding_window}, softcap = {self.logits_soft_cap}, alibi_slopes = {self.alibi_slopes}")
# output[num_prefill_tokens:] = self.fa_decode_attn_func( # output[num_prefill_tokens:] = self.fa_decode_attn_func(
output[num_prefill_tokens:] = vllm_flash_attn_with_kvcache( output[num_prefill_tokens:] = vllm_flash_attn_with_kvcache(
q=decode_query.unsqueeze(1), q=decode_query.unsqueeze(1),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment