Commit c8a7d51c (unverified), authored by Noam Gat, committed by GitHub
Browse files

[Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2...

[Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (#6501)
parent e2fbaee7
...@@ -20,6 +20,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, ...@@ -20,6 +20,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
compute_slot_mapping_start_idx, compute_slot_mapping_start_idx,
is_block_tables_empty) is_block_tables_empty)
+ from vllm.attention.ops.paged_attn import PagedAttention
from vllm.sequence import SequenceGroupMetadata from vllm.sequence import SequenceGroupMetadata
from vllm.utils import get_kv_cache_torch_dtype, make_tensor_with_pad from vllm.utils import get_kv_cache_torch_dtype, make_tensor_with_pad
...@@ -61,14 +62,14 @@ class FlashInferBackend(AttentionBackend): ...@@ -61,14 +62,14 @@ class FlashInferBackend(AttentionBackend):
dst_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor,
src_to_dst: torch.Tensor, src_to_dst: torch.Tensor,
) -> None: ) -> None:
- raise NotImplementedError
+ PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
@staticmethod @staticmethod
def copy_blocks( def copy_blocks(
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
src_to_dists: torch.Tensor, src_to_dists: torch.Tensor,
) -> None: ) -> None:
- raise NotImplementedError
+ PagedAttention.copy_blocks(kv_caches, src_to_dists)
@staticmethod @staticmethod
def get_supported_head_sizes() -> List[int]: def get_supported_head_sizes() -> List[int]:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment