删除DPSK_FP16_QUICK，以及增加awq和blockwiseint8的shared_output接口

7f459b46 · yangql · c3b8a0ae · 7f459b46 · 7f459b46 · 7f459b46
Commit 7f459b46 authored Oct 15, 2025 by yangql
7 changed files
--- a/vllm/model_executor/layers/fused_moe/ep_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/ep_moe/layer.py
@@ -238,8 +238,6 @@ class EPMoE(FusedMoE):
        self.shared_expert_overlap = moe_shared_expert_overlap
        self.shared_experts = None
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
        self.scales = None
        self.use_int8_dispatch = True
@@ -435,7 +433,7 @@ class EPMoE(FusedMoE):
            #         self.maybe_all_reduce_tensor_model_parallel(
            #             shared_output))
-            if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
+            if hidden_states.dtype != torch.float16:
                final_hidden_states = final_hidden_states + shared_output
            else:
                # Fix FP16 overflow

--- a/vllm/model_executor/layers/fused_moe/ep_moe/token_dispatcher.py
+++ b/vllm/model_executor/layers/fused_moe/ep_moe/token_dispatcher.py
@@ -181,7 +181,6 @@ class MoEAlltoAllTokenDispatcher(MoETokenDispatcher):
        self.use_all_gather = current_platform.use_all_gather()
        self.probs = None
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
        # For smuggling this layer into the fused moe custom op
        vllm_config = get_current_vllm_config()
@@ -446,7 +445,7 @@ class MoEAlltoAllTokenDispatcher(MoETokenDispatcher):
        if self.config.moe_shared_expert_overlap and self.shared_experts is not None:
            shared_output = self.shared_experts.get_output()
-            if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
+            if hidden_states.dtype != torch.float16:
                output = output + shared_output
            else:
                # Fix FP16 overflow

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -45,9 +45,6 @@ from lightop import op
 # from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled
-os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
-dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
 logger = init_logger(__name__)
 if envs.VLLM_USE_GLOBAL_CACHE13:
@@ -1899,7 +1896,7 @@ def fused_experts_impl(
                                block_shape=block_shape,
                                use_nn_moe=use_nn_moe)
-        if envs.VLLM_USE_LIGHTOP and not dpsk_fp16_quick: 
+        if envs.VLLM_USE_LIGHTOP: 
            from lightop import op as op
            op.moe_sum(input=intermediate_cache3.view(*intermediate_cache3.size()),
                    output=out_hidden_states[begin_chunk_idx:end_chunk_idx], bias=shared_output[begin_chunk_idx:end_chunk_idx], 

--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -514,6 +514,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        shared_output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if enable_eplb:
            raise NotImplementedError(

--- a/vllm/model_executor/layers/quantization/blockwise_int8.py
+++ b/vllm/model_executor/layers/quantization/blockwise_int8.py
@@ -473,6 +473,7 @@ class BlockInt8MoEMethod:
        use_nn_moe: Optional[bool] = False,
        routed_scaling_factor: Optional[float] = None,
        use_fused_gate: Optional[bool] = False,
+        shared_output: Optional[torch.Tensor] = None,
        **_    
    ) -> torch.Tensor:
        from vllm.model_executor.layers.fused_moe import fused_experts
@@ -514,5 +515,6 @@ class BlockInt8MoEMethod:
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
            block_shape=self.quant_config.weight_block_size,
-            use_nn_moe=use_nn_moe
+            use_nn_moe=use_nn_moe,
+            shared_output=shared_output,
        )
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -348,6 +348,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        shared_output: Optional[torch.Tensor] = None,
        use_nn_moe: Optional[bool] = False,
        routed_scaling_factor: Optional[float] = None,
        use_fused_gate: Optional[bool] = False,
@@ -430,7 +431,9 @@ class MoeWNA16Method(FusedMoEMethodBase):
            w1_zp=layer.w13_qzeros if has_zp else None,
            w2_zp=layer.w2_qzeros if has_zp else None,
            block_shape=[0, layer.group_size],
-            use_nn_moe=False)
+            use_nn_moe=False,
+            shared_output=shared_output,
+            )
    @staticmethod
    def get_weight_loader(layer, weight_loader):

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -67,7 +67,6 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
 from vllm import _custom_ops as ops
 from vllm.utils import W8a8GetCacheJSON
-os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
 class DeepseekV2MLP(nn.Module):
    def __init__(
@@ -155,7 +154,6 @@ class DeepseekV2MoE(nn.Module):
        vllm_config = get_current_vllm_config()
        parallel_config = vllm_config.parallel_config
        self.enable_eplb = enable_eplb
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
        self.n_redundant_experts = parallel_config.num_redundant_experts
        self.n_logical_experts = self.n_routed_experts
@@ -227,13 +225,13 @@ class DeepseekV2MoE(nn.Module):
        router_logits, _ = self.gate(hidden_states)
        if not self.use_mori_ep:
-            if envs.VLLM_USE_LIGHTOP and not self.dpsk_fp16_quick:
+            if envs.VLLM_USE_LIGHTOP:
                final_hidden_states = self.experts(
                    hidden_states=hidden_states,
                    router_logits=router_logits,
                    shared_output=shared_output)
            else:
-                if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
+                if hidden_states.dtype != torch.float16:
                    final_hidden_states = self.experts(
                        hidden_states=hidden_states,
                        router_logits=router_logits) * self.routed_scaling_factor
@@ -243,7 +241,7 @@ class DeepseekV2MoE(nn.Module):
                    final_hidden_states = self.experts(hidden_states=hidden_states,
                                                    router_logits=router_logits)
                if shared_output is not None:
-                    if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
+                    if hidden_states.dtype != torch.float16:
                        final_hidden_states = final_hidden_states + shared_output
                    else:
                        # Fix FP16 overflow
@@ -671,7 +669,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
        if (config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0):
@@ -724,7 +722,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                )
                residual = new_residual
-            if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+            if hidden_states.dtype == torch.float16:
                # rmsnorm, and rmsnorm result would not affect by scale.
                hidden_states *= 1. / self.routed_scaling_factor
                if self.layer_idx == 0 or residual_fix_overflow:
@@ -735,7 +733,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)
            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # Scaling the DeepseekV2MLP output, it is the input of
                # input_layernorm of next decoder layer.
@@ -760,7 +758,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                hidden_states=hidden_states,
            )
-            if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+            if hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # We scale both hidden_states and residual before
                # rmsnorm, and rmsnorm result would not affect by scale.
@@ -776,7 +774,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            hidden_states = self.mlp(hidden_states)
            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # Scaling the DeepseekV2MLP output, it is the input of
                # input_layernorm of next decoder layer.