Merge branch 'v0.9.2-dev' into v0.9.2-dev-ds

2805c93e · zhuwenwen · c8de4a43 · 1b98d0bb · 2805c93e · 2805c93e
Commit 2805c93e authored Oct 15, 2025 by zhuwenwen
6 changed files
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PagedAttention
 | Qwen3MoeForCausalLM            | QWen3MoE                                                    | Yes | - | - | v0.8.4   | Yes |
 | ChatGLMModel                   | glm-4v-9b,chatglm3,chatglm2                                 | Yes | No  | Yes | v0.5.0   | Yes |
 | Glm4ForCausalLM                | GLM-4-0414                                                  | No/Yes | -  | - | v0.8.5.post1   | Yes |
+| Glm4MoeForCausalLM             | GLM-4.5,GLM-4.5-Air                                         | No/Yes | -  | - | v0.9.2  | Yes |
 | DeepseekForCausalLM            | Deepseek                                                    | Yes | No  | -   | v0.5.0  | Yes |
 | DeepseekV2ForCausalLM          | DeepSeek-V2                                                 | Yes | No  | -   | v0.6.2  | Yes |
 | DeepseekVLV2ForCausalLM        | DeepSeek-VL2                                                | Yes | No  | -   | v0.7.2  | Yes |

--- a/setup.py
+++ b/setup.py
@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
            if sha is None:
                sha = get_sha(vllm_root)
            if (major, minor) >= ('2', '5'):
-                version = 'das.opt1.rc2.' + sha[:7]
+                version = 'das.opt1.' + sha[:7]
    else:
        if (major, minor) >= ('2', '5'):
-            version = 'das.opt1.rc2'
+            version = 'das.opt1'


    # dtk version

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -418,6 +418,9 @@ class ModelConfig:
    - "transformers" will use the Transformers model implementation."""
    override_attention_dtype: Optional[str] = None
    """Override dtype for attention"""
+    enable_chunked_prefill: Optional[bool] = None
+    """If True, prefill requests can be chunked based
+    on the remaining max_num_batched_tokens."""

    def compute_hash(self) -> str:
        """
@@ -448,6 +451,7 @@ class ModelConfig:
        factors.append(self.rope_theta)
        # hf_config can control how the model looks!
        factors.append(self.hf_config.to_json_string())
+        factors.append(self.enable_chunked_prefill)
        str_factors = str(factors)
        assert_hashable(str_factors)
        return hashlib.sha256(str(factors).encode()).hexdigest()

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1004,6 +1004,7 @@ class EngineArgs:
            enable_sleep_mode=self.enable_sleep_mode,
            model_impl=self.model_impl,
            override_attention_dtype=self.override_attention_dtype,
+            enable_chunked_prefill=self.enable_chunked_prefill,
        )

    def create_load_config(self) -> LoadConfig:
@@ -1593,6 +1594,9 @@ class EngineArgs:
        # For pooling tasks the default is False
        if model_config.runner_type != "pooling":
            self.enable_chunked_prefill = True
+            if model_config.enable_chunked_prefill is not None and \
+                model_config.enable_chunked_prefill is False:
+                self.enable_chunked_prefill = False
            if self.enable_prefix_caching is None:
                self.enable_prefix_caching = True
        else:
@@ -1607,6 +1611,10 @@ class EngineArgs:
            action = "Enabling" if \
                incremental_prefill_supported else "Disabling"
            
+            if model_config.enable_chunked_prefill is not None and \
+                model_config.enable_chunked_prefill is False:
+                self.enable_chunked_prefill = False
+
            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = incremental_prefill_supported
                logger.info("(%s) chunked prefill by default", action)

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1113,11 +1113,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
                 ("true", "1")),  
    # vLLM will use lightop moe_sum 
    "VLLM_USE_LIGHTOP_MOE_SUM":
-        lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_SUM", "False").lower() in
+        lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_SUM", "True").lower() in
                 ("true", "1")),  
    # vLLM will use lightop moe_align_block_size 
    "VLLM_USE_LIGHTOP_MOE_ALIGN":
-        lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_ALIGN", "False").lower() in
+        lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_ALIGN", "True").lower() in
                 ("true", "1")),     
    # vLLM will use the optimized merge_attn_states kernel instead of the Triton one
    "VLLM_USE_MERGE_ATTN_STATES_OPT":

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -252,6 +252,23 @@ class DeepseekV2MoE(nn.Module):
            final_hidden_states = self.experts(hidden_states=hidden_states,
                                                router_logits=router_logits)
        
+        if shared_output is not None:
+            if hidden_states.dtype != torch.float16:
+                final_hidden_states = final_hidden_states + shared_output
+            else:
+                # Fix FP16 overflow
+                # See DeepseekV2DecoderLayer for more details.
+                final_hidden_states = final_hidden_states + shared_output \
+                    * (1. / self.routed_scaling_factor)
+
+        if self.tp_size > 1:
+            if envs.VLLM_ENABLE_TBO:
+                final_hidden_states = self.tbo_all_reduce(final_hidden_states)
+            else:
+                final_hidden_states = (
+                    self.experts.maybe_all_reduce_tensor_model_parallel(
+                        final_hidden_states))
+
        if not self.use_mori_ep:
            if self.tp_size > 1:
                if envs.VLLM_ENABLE_TBO: