"examples/vscode:/vscode.git/clone" did not exist on "41ae4a1eab22b332cd7d8f233f09611a5da53b1c"
Commit 14945681 authored by zhuwenwen
Browse files

Only K100_AI GPUs can use the CUTLASS prefix cache; all other GPUs use the Triton prefix attention path

parent bbd14169
...@@ -857,7 +857,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): ...@@ -857,7 +857,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
else: else:
# prefix-enabled attention - # prefix-enabled attention -
# not applicable for encoder-only models # not applicable for encoder-only models
if envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN or gpuname.startswith('BW'): if envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN or (not gpuname.startswith('K100_AI')):
version_key = triton_key() version_key = triton_key()
if self.attn_type != AttentionType.ENCODER_ONLY: if self.attn_type != AttentionType.ENCODER_ONLY:
output[:num_prefill_tokens] = paged_attn.forward_prefix( output[:num_prefill_tokens] = paged_attn.forward_prefix(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment