Add kvint8 (int8 KV cache) handling to the PagedAttention V1 kernel selection

6d105cc4 · xiabo · 45273722 · 6d105cc4
Commit 6d105cc4 authored May 14, 2025 by xiabo
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 2 deletions

vllm/attention/ops/paged_attn.py +3 -2

No files found.
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -131,9 +131,10 @@ class PagedAttention:
        # TODO(woosuk): Tune this heuristic.
        # For context len > 8192, use V2 kernel to avoid shared memory shortage.
+        kvquant = False
        if (kv_cache_dtype == "int8"):
-            use_tc = False
+            kvquant = True
-        if use_tc and head_size==128:
+        if use_tc and head_size==128 and not kvquant:
            if envs.VLLM_USE_PA_PRINT_PARAM:
                print("PA V1 SIZE:")
                print(f"query.shape = {query.shape}, key_cache.shape = {key_cache.shape}, value_cache.shape = {value_cache.shape}")