Merge branch 'v0.9.2-dev-yql' into 'v0.9.2-dev'

删除DPSK_FP16_QUICK，以及增加awq和blockwiseint8的shared_output接口
(Remove DPSK_FP16_QUICK, and add the shared_output interface for awq and blockwiseint8.)

See merge request dcutoolkit/deeplearing/vllm!228

Merge branch 'v0.9.2-dev-yql' into 'v0.9.2-dev'
删除DPSK_FP16_QUICK，以及增加awq和blockwiseint8的shared_output接口
(Remove DPSK_FP16_QUICK, and add the shared_output interface for awq and blockwiseint8.)

See merge request dcutoolkit/deeplearing/vllm!228
1b98d0bb · zhuwenwen · 0004c57a · 50cb9270 · 1b98d0bb · 1b98d0bb
Commit 1b98d0bb authored Oct 15, 2025 by zhuwenwen
5 changed files
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -43,9 +43,6 @@ from vllm.utils import direct_register_custom_op

 # from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled

-    
-os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
-dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'

 logger = init_logger(__name__)

@@ -1898,7 +1895,7 @@ def fused_experts_impl(
                                block_shape=block_shape,
                                use_nn_moe=use_nn_moe)

-        if envs.VLLM_USE_LIGHTOP and not dpsk_fp16_quick: 
+        if envs.VLLM_USE_LIGHTOP: 
            from lightop import op as op
            op.moe_sum(input=intermediate_cache3.view(*intermediate_cache3.size()),
                    output=out_hidden_states[begin_chunk_idx:end_chunk_idx], bias=shared_output[begin_chunk_idx:end_chunk_idx], 

--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -514,6 +514,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        shared_output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if enable_eplb:
            raise NotImplementedError(

--- a/vllm/model_executor/layers/quantization/blockwise_int8.py
+++ b/vllm/model_executor/layers/quantization/blockwise_int8.py
@@ -473,6 +473,7 @@ class BlockInt8MoEMethod:
        use_nn_moe: Optional[bool] = False,
        routed_scaling_factor: Optional[float] = None,
        use_fused_gate: Optional[bool] = False,
+        shared_output: Optional[torch.Tensor] = None,
        **_    
    ) -> torch.Tensor:
        from vllm.model_executor.layers.fused_moe import fused_experts
@@ -514,5 +515,6 @@ class BlockInt8MoEMethod:
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
            block_shape=self.quant_config.weight_block_size,
-            use_nn_moe=use_nn_moe
+            use_nn_moe=use_nn_moe,
+            shared_output=shared_output,
        )
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -348,6 +348,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        shared_output: Optional[torch.Tensor] = None,
        use_nn_moe: Optional[bool] = False,
        routed_scaling_factor: Optional[float] = None,
        use_fused_gate: Optional[bool] = False,
@@ -430,7 +431,8 @@ class MoeWNA16Method(FusedMoEMethodBase):
            w1_zp=layer.w13_qzeros if has_zp else None,
            w2_zp=layer.w2_qzeros if has_zp else None,
            block_shape=[0, layer.group_size],
-            use_nn_moe=False)
+            use_nn_moe=False,
+            shared_output=shared_output,)

    @staticmethod
    def get_weight_loader(layer, weight_loader):

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -65,7 +65,6 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
 from vllm import _custom_ops as ops
 from vllm.utils import W8a8GetCacheJSON

-os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
 class DeepseekV2MLP(nn.Module):

    def __init__(
@@ -153,7 +152,6 @@ class DeepseekV2MoE(nn.Module):
        vllm_config = get_current_vllm_config()
        parallel_config = vllm_config.parallel_config
        self.enable_eplb = enable_eplb
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'

        self.n_redundant_experts = parallel_config.num_redundant_experts
        self.n_logical_experts = self.n_routed_experts
@@ -216,13 +214,13 @@ class DeepseekV2MoE(nn.Module):
        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)

-        if envs.VLLM_USE_LIGHTOP and not self.dpsk_fp16_quick:
+        if envs.VLLM_USE_LIGHTOP:
            final_hidden_states = self.experts(
                hidden_states=hidden_states,
                router_logits=router_logits,
                shared_output=shared_output)
        else:
-            if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
+            if hidden_states.dtype != torch.float16:
                final_hidden_states = self.experts(
                    hidden_states=hidden_states,
                    router_logits=router_logits) * self.routed_scaling_factor
@@ -233,7 +231,7 @@ class DeepseekV2MoE(nn.Module):
                                                router_logits=router_logits)
        
        if shared_output is not None:
-            if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
+            if hidden_states.dtype != torch.float16:
                final_hidden_states = final_hidden_states + shared_output
            else:
                # Fix FP16 overflow
@@ -658,7 +656,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
+
        if (config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0):
@@ -711,7 +709,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                )
                residual = new_residual
                
-            if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+            if hidden_states.dtype == torch.float16:
                # rmsnorm, and rmsnorm result would not affect by scale.
                hidden_states *= 1. / self.routed_scaling_factor
                if self.layer_idx == 0 or residual_fix_overflow:
@@ -722,7 +720,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)

            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # Scaling the DeepseekV2MLP output, it is the input of
                # input_layernorm of next decoder layer.
@@ -747,7 +745,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                hidden_states=hidden_states,
            )

-            if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+            if hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # We scale both hidden_states and residual before
                # rmsnorm, and rmsnorm result would not affect by scale.
@@ -763,7 +761,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            hidden_states = self.mlp(hidden_states)

            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # Scaling the DeepseekV2MLP output, it is the input of
                # input_layernorm of next decoder layer.