删除DPSK_FP16_QUICK

1851782d · zhuwenwen · 8e22ded2 · 1851782d
Commit 1851782d authored Oct 31, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 9 deletions

vllm/model_executor/models/deepseek_v2.py vllm/model_executor/models/deepseek_v2.py +5 -9

No files found.
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -89,8 +89,6 @@ if current_platform.is_cuda_alike():
 elif current_platform.is_xpu():
    from vllm._ipex_ops import ipex_ops as ops

-os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
-
 logger = init_logger(__name__)


@@ -191,7 +189,6 @@ class DeepseekV2MoE(nn.Module):
        # Load balancing settings.
        eplb_config = parallel_config.eplb_config
        self.enable_eplb = parallel_config.enable_eplb
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'

        self.n_redundant_experts = eplb_config.num_redundant_experts
        self.n_logical_experts = self.n_routed_experts
@@ -288,7 +285,7 @@ class DeepseekV2MoE(nn.Module):
        
        # Fix FP16 overflow
        # See DeepseekV2DecoderLayer for more details.
-        if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
+        if hidden_states.dtype != torch.float16:
            final_hidden_states *= self.routed_scaling_factor
        elif self.shared_experts is not None:
            assert shared_output is not None
@@ -1054,7 +1051,6 @@ class DeepseekV2DecoderLayer(nn.Module):
            prefix=f"{prefix}.self_attn",
            topk_indices_buffer=topk_indices_buffer,
        )
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
        if (config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0):
@@ -1107,7 +1103,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                )
                residual = new_residual
                
-            if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+            if hidden_states.dtype == torch.float16:
                 # rmsnorm, and the rmsnorm result would not be affected by the scale.
                hidden_states *= 1. / self.routed_scaling_factor
                if self.layer_idx == 0 or residual_fix_overflow:
@@ -1118,7 +1114,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)

            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # Scaling the DeepseekV2MLP output, it is the input of
                # input_layernorm of next decoder layer.
@@ -1142,7 +1138,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                hidden_states=hidden_states,
            )

-            if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+            if hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # We scale both hidden_states and residual before
                 # rmsnorm, and the rmsnorm result would not be affected by the scale.
@@ -1157,7 +1153,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                hidden_states, residual)
            hidden_states = self.mlp(hidden_states)
            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # Scaling the DeepseekV2MLP output, it is the input of
                # input_layernorm of next decoder layer.