[Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (#6501)

[Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (#6501)

[Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (#6501)
[Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (#6501)
c8a7d51c · Noam Gat · GitHub · e2fbaee7 · c8a7d51c
Unverified commit c8a7d51c, authored Jul 18, 2024 by Noam Gat; committed by GitHub on Jul 18, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 2 deletions

vllm/attention/backends/flashinfer.py vllm/attention/backends/flashinfer.py +3 -2

No files found.
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -20,6 +20,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
                                           compute_slot_mapping_start_idx,
                                           is_block_tables_empty)
+from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.sequence import SequenceGroupMetadata
 from vllm.utils import get_kv_cache_torch_dtype, make_tensor_with_pad

@@ -61,14 +62,14 @@ class FlashInferBackend(AttentionBackend):
        dst_kv_cache: torch.Tensor,
        src_to_dst: torch.Tensor,
    ) -> None:
-        raise NotImplementedError
+        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)

    @staticmethod
    def copy_blocks(
        kv_caches: List[torch.Tensor],
        src_to_dists: torch.Tensor,
    ) -> None:
-        raise NotImplementedError
+        PagedAttention.copy_blocks(kv_caches, src_to_dists)

    @staticmethod
    def get_supported_head_sizes() -> List[int]: