解决TORCHDYNAMO跟fp8 scale冲突的问题

4a0dfa15 · zhangshao · bf93e83b · 4a0dfa15
Commit 4a0dfa15 authored Mar 05, 2026 by zhangshao
Show whitespace changes
Inline Side-by-side

Showing with 9 additions and 8 deletions

vllm/attention/layer.py vllm/attention/layer.py +9 -8

No files found.
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -291,6 +291,7 @@ class Attention(nn.Module, AttentionLayerBase):
            #     self.calc_kv_scales(query, key, value)
            torch.ops.vllm.maybe_calc_kv_scales(query, key, value,
                                                self.layer_name)
+            self.check_fp8_overflow = False
        output_dtype = query.dtype
        if self.query_quant is not None:
@@ -358,12 +359,6 @@ class Attention(nn.Module, AttentionLayerBase):
                    query, key, value, self.layer_name)
    def calc_kv_scales(self, query, key, value):
-        self.check_fp8_overflow = False
-        if self.calculate_kv_scales == False :
-            if self.kv_cache_dtype in {"fp8", "fp8_e4m3"} and torch.abs(query).max().item()<=200 : #check fp8  overflow
-                return
-            if self.kv_cache_dtype in {"fp8_e5m2"} and torch.abs(query).max().item()>=0.01 : #check fp8 too small
-                return
        bias=0.0 # add a bias so that q values that are too small (or zero) do not yield incorrect scales
        if torch.abs(query).max().item() < 0.01:
            if self.kv_cache_dtype in {"fp8_e5m2"}:
@@ -601,7 +596,13 @@ def maybe_calc_kv_scales(
    # Only calculate if the layer's calculate_kv_scales flag is True
    # This flag gets set to False after the first forward pass
+    if self.check_fp8_overflow :
+        if self.kv_cache_dtype in {"fp8", "fp8_e4m3"} and torch.abs(query).max().item()>200 : #check fp8  overflow
+            self.calculate_kv_scales = True
+        if self.kv_cache_dtype in {"fp8_e5m2"} and torch.abs(query).max().item()<0.01 : #check fp8 too small
+            self.calculate_kv_scales = True
+    if not self.calculate_kv_scales:
+        return
    self.calc_kv_scales(query, key, value)