修复residual FP16 overflow，解决mtp采样率和数据集精度的冲突

0369a0a6 · zhuwenwen · ab1ea369 · 0369a0a6
Commit 0369a0a6 authored Aug 29, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 2 deletions

vllm/model_executor/models/deepseek_v2.py vllm/model_executor/models/deepseek_v2.py +6 -2

No files found.
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -66,7 +66,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
 from vllm import _custom_ops as ops
 from vllm.utils import W8a8GetCacheJSON
-os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '1')
+os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
 class DeepseekV2MLP(nn.Module):
    def __init__(
@@ -621,9 +621,13 @@ class DeepseekV2DecoderLayer(nn.Module):
        residual: Optional[torch.Tensor],
    ) -> torch.Tensor:
        # Self Attention
+        # Fix residual FP16 overflow
+        residual_fix_overflow = False
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
+            residual_fix_overflow = True
        else:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)
@@ -637,7 +641,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            # We scale both hidden_states and residual before
            # rmsnorm; the rmsnorm result is not affected by the scale.
            hidden_states *= 1. / self.routed_scaling_factor
-            if self.layer_idx == 0:
+            if self.layer_idx == 0 or residual_fix_overflow:
                # The residual is shared by all layers, so scale it only once:
                # on the first layer, or on the layer where it is initialized.
                residual *= 1. / self.routed_scaling_factor