修复block-wise的错误参数，以及增加dpsk-fp16-dtype的性能选择功能

cb37537e · yangql · 4d479e7e · cb37537e · cb37537e · cb37537e
Commit cb37537e authored Jul 25, 2025 by yangql
3 changed files
--- a/vllm/model_executor/layers/quantization/blockwise_int8.py
+++ b/vllm/model_executor/layers/quantization/blockwise_int8.py
@@ -468,13 +468,16 @@ class BlockInt8MoEMethod:
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
+        enable_eplb: bool = False,
        use_nn_moe: Optional[bool] = False,
        routed_scaling_factor: Optional[float] = None,
        use_fused_gate: Optional[bool] = False,
-    
+        **_    
    ) -> torch.Tensor:
        from vllm.model_executor.layers.fused_moe import fused_experts
-        
+        if enable_eplb:
+            raise NotImplementedError(
+                "EPLB not supported for `BlockInt8MoEMethod` yet.")
        # Expert selection
        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,

--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -483,7 +483,8 @@ class MoeWNA16Method(FusedMoEMethodBase):
        def moe_wna16_weight_loader(param: torch.nn.Parameter,
                                    loaded_weight: torch.Tensor,
                                    weight_name: str, shard_id: str,
-                                    expert_id: int):
+                                    expert_id: int,
+                                    return_success: bool = False):
            if "g_idx" in weight_name:
                return
            if not layer.quant_config.has_zp and "qzeros" in weight_name:
@@ -539,5 +540,6 @@ class MoeWNA16Method(FusedMoEMethodBase):
            else:
                weight_loader(param, loaded_weight, weight_name, shard_id,
                              expert_id)
-
+            return_success = True
+            return return_success
        return moe_wna16_weight_loader
\ No newline at end of file
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -65,6 +65,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
 from vllm import _custom_ops as ops
 from vllm.utils import W8a8GetCacheJSON

+os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '1')
 class DeepseekV2MLP(nn.Module):

    def __init__(
@@ -138,6 +139,7 @@ class DeepseekV2MoE(nn.Module):
        vllm_config = get_current_vllm_config()
        parallel_config = vllm_config.parallel_config
        self.enable_eplb = enable_eplb
+        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'

        self.n_redundant_experts = parallel_config.num_redundant_experts
        self.n_logical_experts = self.n_routed_experts
@@ -191,7 +193,7 @@ class DeepseekV2MoE(nn.Module):
        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)

-        if hidden_states.dtype != torch.float16:
+        if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
            final_hidden_states = self.experts(
                hidden_states=hidden_states,
                router_logits=router_logits) * self.routed_scaling_factor
@@ -202,7 +204,7 @@ class DeepseekV2MoE(nn.Module):
                                               router_logits=router_logits)
        
        if shared_output is not None:
-            if hidden_states.dtype != torch.float16:
+            if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
                final_hidden_states = final_hidden_states + shared_output
            else:
                # Fix FP16 overflow
@@ -575,7 +577,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )
-
+        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
        if (config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0):
@@ -617,7 +619,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            hidden_states=hidden_states,
        )

-        if hidden_states.dtype == torch.float16:
+        if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
            # Fix FP16 overflow
            # We scale both hidden_states and residual before
            # rmsnorm, and rmsnorm result would not affect by scale.
@@ -633,7 +635,7 @@ class DeepseekV2DecoderLayer(nn.Module):
        hidden_states = self.mlp(hidden_states)

        if isinstance(self.mlp,
-                      DeepseekV2MLP) and hidden_states.dtype == torch.float16:
+                      DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
            # Fix FP16 overflow
            # Scaling the DeepseekV2MLP output, it is the input of
            # input_layernorm of next decoder layer.