Merge remote-tracking branch 'origin/v0.9.2-dev-ds' into v0.9.2-dev-ds

# Conflicts:
#	vllm/model_executor/layers/fused_moe/ep_moe/layer.py

Merge remote-tracking branch 'origin/v0.9.2-dev-ds' into v0.9.2-dev-ds
# Conflicts:
#	vllm/model_executor/layers/fused_moe/ep_moe/layer.py
5a5e4f3b · 王敏 · f505d366 · a7992f79 · 5a5e4f3b · 5a5e4f3b
Commit 5a5e4f3b authored Oct 21, 2025 by 王敏
4 changed files
--- a/vllm/model_executor/layers/quantization/slimquant_w4a8.py
+++ b/vllm/model_executor/layers/quantization/slimquant_w4a8.py
@@ -21,6 +21,7 @@ from vllm.utils import W8a8GetCacheJSON

 import os
 from vllm import _custom_ops as ops
+
 from vllm import envs

 try:
@@ -441,3 +442,5 @@ class SlimQuantW4A8Int8MoEMethod:
            shared_output=shared_output,
            routed_scaling_factor=routed_scaling_factor,
        )
+        
+
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -67,7 +67,6 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
 from vllm import _custom_ops as ops
 from vllm.utils import W8a8GetCacheJSON

-os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
 class DeepseekV2MLP(nn.Module):

    def __init__(
@@ -155,7 +154,6 @@ class DeepseekV2MoE(nn.Module):
        vllm_config = get_current_vllm_config()
        parallel_config = vllm_config.parallel_config
        self.enable_eplb = enable_eplb
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'

        self.n_redundant_experts = parallel_config.num_redundant_experts
        self.n_logical_experts = self.n_routed_experts
@@ -227,13 +225,13 @@ class DeepseekV2MoE(nn.Module):
        router_logits, _ = self.gate(hidden_states)

        if not self.use_mori_ep:
-            if envs.VLLM_USE_LIGHTOP and not self.dpsk_fp16_quick:
+            if envs.VLLM_USE_LIGHTOP:
                final_hidden_states = self.experts(
                    hidden_states=hidden_states,
                    router_logits=router_logits,
                    shared_output=shared_output)
            else:
-                if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
+                if hidden_states.dtype != torch.float16:
                    final_hidden_states = self.experts(
                        hidden_states=hidden_states,
                        router_logits=router_logits) * self.routed_scaling_factor
@@ -243,7 +241,7 @@ class DeepseekV2MoE(nn.Module):
                    final_hidden_states = self.experts(hidden_states=hidden_states,
                                                    router_logits=router_logits)
                if shared_output is not None:
-                    if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
+                    if hidden_states.dtype != torch.float16:
                        final_hidden_states = final_hidden_states + shared_output
                    else:
                        # Fix FP16 overflow
@@ -671,7 +669,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )
-        self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
+
        if (config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0):
@@ -723,8 +721,8 @@ class DeepseekV2DecoderLayer(nn.Module):
                    residual = residual
                )
                residual = new_residual
-
-            if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+               
+            if hidden_states.dtype == torch.float16:
                # rmsnorm, and rmsnorm result would not affect by scale.
                hidden_states *= 1. / self.routed_scaling_factor
                if self.layer_idx == 0 or residual_fix_overflow:
@@ -735,7 +733,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)

            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # Scaling the DeepseekV2MLP output, it is the input of
                # input_layernorm of next decoder layer.
@@ -760,7 +758,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                hidden_states=hidden_states,
            )

-            if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+            if hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # We scale both hidden_states and residual before
                # rmsnorm, and rmsnorm result would not affect by scale.
@@ -776,7 +774,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            hidden_states = self.mlp(hidden_states)

            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
+                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
                # Fix FP16 overflow
                # Scaling the DeepseekV2MLP output, it is the input of
                # input_layernorm of next decoder layer.
@@ -925,7 +923,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts):
        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
        self.tritonsingleton= W8a8GetCacheJSON() 
        self.tritonsingleton.topk = config.num_experts_per_tok
-        self.tritonsingleton.quant_method=self.quant_method 
+        self.tritonsingleton.quant_method=self.quant_method

        parallel_config = vllm_config.parallel_config
        dp_size = get_dp_group().world_size
@@ -1120,7 +1118,10 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts):
                    if is_pp_missing_parameter(name, self):
                        continue

-                    param = params_dict[name]
+                    try:
+                        param = params_dict[name]
+                    except Exception as e:
+                        continue
                    weight_loader = getattr(param, "weight_loader",
                                            default_weight_loader)
                    weight_loader(param, loaded_weight)

--- a/vllm/two_batch_overlap/v1/model_input_split_v1.py
+++ b/vllm/two_batch_overlap/v1/model_input_split_v1.py
@@ -159,8 +159,6 @@ def prepare_tbo_atten_metadata(
            # The block_table for RIGHT starts from (req_offset-1).
            # Align both offsets to that, and re-build the seq_lens for row-0.
            seq_len_offset = req_offset - 1
-
-            # query_start_offset = req_offset - 1
            query_start_offset = req_offset

            # row-0 is the split request (global row index = req_offset-1):
@@ -182,7 +180,6 @@ def prepare_tbo_atten_metadata(
        else:
            # RIGHT without split-in-req: natural positions
            seq_len_offset = req_offset
-            # query_start_offset = req_offset
            query_start_offset = req_offset + 1
            seq_lens_cpu_local = torch.as_tensor(default_seq_lens, device=runner.seq_lens_cpu.device)


--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import os
 import copy
 import gc
 import time
@@ -1301,7 +1302,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        scheduler_output: "SchedulerOutput",
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> Union[ModelRunnerOutput, IntermediateTensors]:
-        # profile.StartTracer()
        self._update_states(scheduler_output)
        if not scheduler_output.total_num_scheduled_tokens:
            if not has_kv_transfer_group():