Commit 6d105cc4 authored by xiabo
Browse files

add kvint8

parent 45273722
......@@ -131,9 +131,10 @@ class PagedAttention:
# TODO(woosuk): Tune this heuristic.
# For context len > 8192, use V2 kernel to avoid shared memory shortage.
kvquant = False
if (kv_cache_dtype == "int8"):
use_tc = False
if use_tc and head_size==128:
kvquant = True
if use_tc and head_size==128 and not kvquant:
if envs.VLLM_USE_PA_PRINT_PARAM:
print("PA V1 SIZE:")
print(f"query.shape = {query.shape}, key_cache.shape = {key_cache.shape}, value_cache.shape = {value_cache.shape}")
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment