[BUG] fix crash on flashinfer backend with cudagraph disabled, when attention group_size not in [1,2,4,8] (#7509)

[BUG] fix crash on flashinfer backend with cudagraph disabled, when attention group_size not in [1,2,4,8] (#7509)

[BUG] fix crash on flashinfer backend with cudagraph disabled, when attention group_size not in [1,2,4,8] (#7509)
[BUG] fix crash on flashinfer backend with cudagraph disabled, when attention group_size not in [1,2,4,8] (#7509)
53328d75 · LI MOU · GitHub · c75363fb · 53328d75 · 53328d75
Unverified commit 53328d75, authored Aug 21, 2024 by LI MOU; committed by GitHub on Aug 21, 2024
Show whitespace changes
Inline Side-by-side

Showing with 9 additions and 4 deletions

tests/kernels/test_flashinfer.py +5 -2

vllm/attention/backends/flashinfer.py +4 -2

No files found.
--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -4,7 +4,7 @@ import flashinfer
 import pytest
 import torch

-NUM_HEADS = [(16, 16), (32, 8), (64, 8)]
+NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)]
 HEAD_SIZES = [128, 256]
 BLOCK_SIZES = [16, 32]
 DTYPES = [torch.float16, torch.bfloat16]
@@ -123,7 +123,10 @@ def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],

    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.\
-        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD")
+        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
+                use_tensor_cores=(
+                    (num_query_heads//num_kv_heads) not in (1, 2, 4, 8))
+                )
    wrapper.begin_forward(kv_indptr,
                          kv_indices,
                          kv_last_page_lens,

--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -113,7 +113,8 @@ class FlashInferState(AttentionState):
                self.runner.parallel_config))
            num_kv_heads = self.runner.model_config.get_num_kv_heads(
                self.runner.parallel_config)
-            use_tensor_cores = num_qo_heads // num_kv_heads >= 4
+            use_tensor_cores = (num_qo_heads // num_kv_heads) not in \
+                (1, 2, 4, 8)
            self._decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
                self._get_workspace_buffer(),
                "NHD",
@@ -171,7 +172,8 @@ class FlashInferState(AttentionState):
            self.runner.parallel_config))
        num_kv_heads = self.runner.model_config.get_num_kv_heads(
            self.runner.parallel_config)
-        use_tensor_cores = num_qo_heads // num_kv_heads >= 4
+        use_tensor_cores = (num_qo_heads // num_kv_heads) not in \
+            (1, 2, 4, 8)
        self._graph_decode_wrapper = \
            CUDAGraphBatchDecodeWithPagedKVCacheWrapper(
            self._graph_decode_workspace_buffer, _indptr_buffer,