"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "07bcd2d4e6fae0bb46ab97a1eecd5ef86f6f4b05"
Unverified commit d8ddb316, authored by Yi Liu and committed by GitHub
Browse files

[Bugfix][CT] Fix KV cache scale handling (#39418)


Signed-off-by: yiliu30 <yi4.liu@intel.com>
parent 1ce0318c
@@ -1123,6 +1123,17 @@ class CompressedTensorsKVCacheMethod(BaseKVCacheMethod):
layer._v_scale = layer.v_scale layer._v_scale = layer.v_scale
layer._q_scale = layer.q_scale layer._q_scale = layer.q_scale
# Set the _float variants that the attention backend uses.
def _to_scalar(tensor: torch.Tensor) -> float:
# For n_scales > 1 (e.g., ATTN_HEAD strategy), take max
if tensor.numel() > 1:
return tensor.max().item()
return tensor.item()
layer._k_scale_float = _to_scalar(layer.k_scale)
layer._v_scale_float = _to_scalar(layer.v_scale)
layer._q_scale_float = _to_scalar(layer.q_scale)
# Discard all placeholders. # Discard all placeholders.
del layer.k_scale del layer.k_scale
del layer.v_scale del layer.v_scale
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment