Commit 6d105cc4 authored by xiabo
Browse files

Add int8 KV-cache (kvint8) support

parent 45273722
...@@ -131,9 +131,10 @@ class PagedAttention: ...@@ -131,9 +131,10 @@ class PagedAttention:
# TODO(woosuk): Tune this heuristic. # TODO(woosuk): Tune this heuristic.
# For context len > 8192, use V2 kernel to avoid shared memory shortage. # For context len > 8192, use V2 kernel to avoid shared memory shortage.
kvquant = False
if (kv_cache_dtype == "int8"): if (kv_cache_dtype == "int8"):
use_tc = False kvquant = True
if use_tc and head_size==128: if use_tc and head_size==128 and not kvquant:
if envs.VLLM_USE_PA_PRINT_PARAM: if envs.VLLM_USE_PA_PRINT_PARAM:
print("PA V1 SIZE:") print("PA V1 SIZE:")
print(f"query.shape = {query.shape}, key_cache.shape = {key_cache.shape}, value_cache.shape = {value_cache.shape}") print(f"query.shape = {query.shape}, key_cache.shape = {key_cache.shape}, value_cache.shape = {value_cache.shape}")
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment