Add VLLM_USE_OPT_ZEROS to replace the triton_ kernel (torch.zeros) with torch.empty

Set default_max_num_batched_tokens = 10240 for the OpenAI API server; update the qwen3_moe q/k layernorm to call forward_cuda; turn off the lightop path of moe_fused_gate.

Add VLLM_USE_OPT_ZEROS to replace the triton_ kernel (torch.zeros) with torch.empty
Set default_max_num_batched_tokens = 10240 for the OpenAI API server; update the qwen3_moe q/k layernorm to call forward_cuda; turn off the lightop path of moe_fused_gate.
e7f2785f · zhuwenwen · 671dcfff · e7f2785f · e7f2785f · e7f2785f
Commit e7f2785f authored Nov 13, 2025 by zhuwenwen
7 changed files
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -297,9 +297,14 @@ class Attention(nn.Module, AttentionLayerBase):
        if self.use_output:
            output_shape = (output_shape
                            if output_shape is not None else query.shape)
-            output = torch.zeros(output_shape,
-                                 dtype=output_dtype,
-                                 device=query.device)
+            if envs.VLLM_USE_OPT_ZEROS:
+                output = torch.empty(output_shape,
+                                    dtype=query.dtype,
+                                    device=query.device)
+            else:
+                output = torch.zeros(output_shape,
+                                    dtype=query.dtype,
+                                    device=query.device)
            hidden_size = output_shape[-1]
            # We skip reshaping query, key and value tensors for the MLA
            # backend since these tensors have different semantics and are

--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -199,7 +199,7 @@ class ModelConfig:
    """Whether to disable sliding window. If True, we will disable the sliding
    window functionality of the model, capping to sliding window size. If the
    model does not support sliding window, this argument is ignored."""
-    disable_cascade_attn: bool = True
+    disable_cascade_attn: bool = False
    """Disable cascade attention for V1. While cascade attention does not
    change the mathematical correctness, disabling it could be useful for
    preventing potential numerical issues. Note that even if this is set to

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1639,7 +1639,7 @@ class EngineArgs:
            # TODO(woosuk): Tune the default values for other hardware.
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 8192,
-                UsageContext.OPENAI_API_SERVER: 2048,
+                UsageContext.OPENAI_API_SERVER: 10240,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 256,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -227,6 +227,7 @@ if TYPE_CHECKING:
    VLLM_USE_APEX_RN: bool = False
    VLLM_USE_GLOBAL_CACHE13: bool = False
    VLLM_USE_LIGHTOP: bool = False
+    VLLM_USE_OPT_ZEROS: bool = False
    VLLM_USE_OPT_CAT: bool = False
    VLLM_USE_OPT_MOE_SUM: bool = False
    VLLM_USE_LIGHTOP_MOE_SUM: bool = False
@@ -1605,6 +1606,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
        lambda: (os.environ.get("VLLM_USE_LIGHTOP", "False").lower() in
                 ("true", "1")),
        
+    # vLLM will use elementwise instead of triton_
+    "VLLM_USE_OPT_ZEROS":
+        lambda: (os.environ.get("VLLM_USE_OPT_ZEROS", "False").lower() in
+                 ("true", "1")),
+        
    # vLLM will use opt cat for deepseek-v3
    "VLLM_USE_OPT_CAT":
        lambda: (os.environ.get("VLLM_USE_OPT_CAT", "True").lower() in

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1793,7 +1793,8 @@ class FusedMoE(CustomOp):
            assert topk_group is not None
            assert num_expert_group is not None
            if use_fused_gate:
-                if envs.VLLM_USE_LIGHTOP:
+                # if envs.VLLM_USE_LIGHTOP:
+                if False:
                    from lightop import op as op
                    topk_weights, topk_ids = op.moe_fused_gate(
                        router_logits,

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -194,8 +194,8 @@ def _get_model_architecture(
                os.environ['LM_NN'] = '1'
                
            if architectures in [['DeepseekV3ForCausalLM'], ['DeepSeekMTPModel']]:
-                # if not envs.is_set("VLLM_USE_LIGHTOP"):
-                #     os.environ['VLLM_USE_LIGHTOP'] = '1'
+                if not envs.is_set("VLLM_USE_LIGHTOP"):
+                    os.environ['VLLM_USE_LIGHTOP'] = '1'
                if not envs.is_set("VLLM_USE_OPT_CAT"):
                    os.environ['VLLM_USE_OPT_CAT'] = '1'


--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -284,7 +284,7 @@ class Qwen3MoeAttention(nn.Module):
        if envs.VLLM_USE_APEX_RN:
            q_by_head = self.q_norm.forward_apex(q_by_head)
        else:
-            q_by_head = self.q_norm(q_by_head)
+            q_by_head = self.q_norm.forward_cuda(q_by_head)
        q = q_by_head.view(q.shape)

        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
@@ -292,7 +292,7 @@ class Qwen3MoeAttention(nn.Module):
        if envs.VLLM_USE_APEX_RN:
            k_by_head = self.k_norm.forward_apex(k_by_head)
        else:
-            k_by_head = self.k_norm(k_by_head)
+            k_by_head = self.k_norm.forward_cuda(k_by_head)
        k = k_by_head.view(k.shape)
        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v)