修复 fp8 KV cache scale 计算错误的问题

bf93e83b · zhangshao · 3fe3d07c · bf93e83b
Commit bf93e83b authored Mar 04, 2026 by zhangshao
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 3 additions and 2 deletions

vllm/attention/layer.py vllm/attention/layer.py +3 -2

No files found.
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -358,10 +358,11 @@ class Attention(nn.Module, AttentionLayerBase):
                    query, key, value, self.layer_name)
    def calc_kv_scales(self, query, key, value):
+        self.check_fp8_overflow = False
        if self.calculate_kv_scales == False :
            if self.kv_cache_dtype in {"fp8", "fp8_e4m3"} and torch.abs(query).max().item()<=200 : #check fp8  overflow
                return
-            if  torch.abs(query).max().item()>=0.01 : #check fp8 too small
+            if self.kv_cache_dtype in {"fp8_e5m2"} and torch.abs(query).max().item()>=0.01 : #check fp8 too small
                return
        bias=0.0 # add bias to avoid q values are too small(or zeros) and scales are not correct
        if torch.abs(query).max().item() < 0.01:
@@ -378,7 +379,7 @@ class Attention(nn.Module, AttentionLayerBase):
        self._v_scale_float = self._v_scale.item()
        # We only calculate the scales once
        self.calculate_kv_scales = False
-        self.check_fp8_overflow = False
    def extra_repr(self) -> str:
        s = f"head_size={self.impl.head_size}"  # type: ignore