[fix]解决moe_fused_gate编译错误，去掉mla中mtp部分的修改

Restore the default setting of disable_cascade_attn (False); add VLLM_USE_OPT_ZEROS to replace triton_ (torch.zeros) with torch.empty; set default_max_num_batched_tokens for OPENAI_API_SERVER to 10240; update the q/k layernorm calls in qwen3_moe to use forward_cuda.

[fix]解决moe_fused_gate编译错误，去掉mla中mtp部分的修改
Restore the default setting of disable_cascade_attn (False); add VLLM_USE_OPT_ZEROS to replace triton_ (torch.zeros) with torch.empty; set default_max_num_batched_tokens for OPENAI_API_SERVER to 10240; update the q/k layernorm calls in qwen3_moe to use forward_cuda.
b956fc64 · zhuwenwen · 1a9b2fa9 · b956fc64 · b956fc64 · b956fc64
Commit b956fc64 authored Nov 13, 2025 by zhuwenwen
6 changed files
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -219,9 +219,14 @@ class Attention(nn.Module):
        if self.use_output:
            output_shape = (output_shape
                            if output_shape is not None else query.shape)
-            output = torch.zeros(output_shape,
-                                 dtype=query.dtype,
-                                 device=query.device)
+            if envs.VLLM_USE_OPT_ZEROS:
+                output = torch.empty(output_shape,
+                                    dtype=query.dtype,
+                                    device=query.device)
+            else:
+                output = torch.zeros(output_shape,
+                                    dtype=query.dtype,
+                                    device=query.device)
            hidden_size = output_shape[-1]
            # We skip reshaping query, key and value tensors for the MLA
            # backend since these tensors have different semantics and are

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -326,7 +326,7 @@ class ModelConfig:
    """Whether to disable sliding window. If True, we will disable the sliding
    window functionality of the model, capping to sliding window size. If the
    model does not support sliding window, this argument is ignored."""
-    disable_cascade_attn: bool = True
+    disable_cascade_attn: bool = False
    """Disable cascade attention for V1. While cascade attention does not
    change the mathematical correctness, disabling it could be useful for
    preventing potential numerical issues. Note that even if this is set to

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1664,7 +1664,7 @@ class EngineArgs:
            # TODO(woosuk): Tune the default values for other hardware.
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 8192,
-                UsageContext.OPENAI_API_SERVER: 2048,
+                UsageContext.OPENAI_API_SERVER: 10240,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 256,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1107,6 +1107,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_LIGHTOP":
        lambda: (os.environ.get("VLLM_USE_LIGHTOP", "False").lower() in
                 ("true", "1")),
+    # vLLM will use elementwise not triton_ (torch.empty replaces torch.zeros)
+    "VLLM_USE_OPT_ZEROS":
+        lambda: (os.environ.get("VLLM_USE_OPT_ZEROS", "False").lower() in
+                 ("true", "1")),
    # vLLM will use opt cat for deepseek-v3
    "VLLM_USE_OPT_CAT":
        lambda: (os.environ.get("VLLM_USE_OPT_CAT", "False").lower() in

--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -224,7 +224,7 @@ class DeepSeekMTP(nn.Module, SupportsPP):
        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
+            if "rotary_emb.inv_freq" in name or "indexer" in name:
                continue
            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
            if spec_layer is None:

--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -234,7 +234,7 @@ class Qwen3MoeAttention(nn.Module):
        if envs.VLLM_USE_APEX_RN:
            q_by_head = self.q_norm.forward_apex(q_by_head)
        else:
-            q_by_head = self.q_norm(q_by_head)
+            q_by_head = self.q_norm.forward_cuda(q_by_head)
        q = q_by_head.view(q.shape)

        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
@@ -242,7 +242,7 @@ class Qwen3MoeAttention(nn.Module):
        if envs.VLLM_USE_APEX_RN:
            k_by_head = self.k_norm.forward_apex(k_by_head)
        else:
-            k_by_head = self.k_norm(k_by_head)
+            k_by_head = self.k_norm.forward_cuda(k_by_head)
        k = k_by_head.view(k.shape)
        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v)