Add VLLM_USE_OPT_ZEROS to replace triton_ (torch.zeros) with torch.empty for the attention output buffer

b364c176 · zhuwenwen · c340a5df · b364c176 · b364c176
Commit b364c176 authored Nov 13, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 3 deletions

vllm/attention/layer.py vllm/attention/layer.py +8 -3

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -219,9 +219,14 @@ class Attention(nn.Module):
        if self.use_output:
            output_shape = (output_shape
                            if output_shape is not None else query.shape)
-            output = torch.zeros(output_shape,
+            if envs.VLLM_USE_OPT_ZEROS:
-                                dtype=query.dtype,
+                output = torch.empty(output_shape,
-                                device=query.device)
+                                    dtype=query.dtype,
+                                    device=query.device)
+            else:
+                output = torch.zeros(output_shape,
+                                    dtype=query.dtype,
+                                    device=query.device)
            hidden_size = output_shape[-1]
            # We skip reshaping query, key and value tensors for the MLA
            # backend since these tensors have different semantics and are

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -165,6 +165,7 @@ if TYPE_CHECKING:
    VLLM_USE_APEX_RN: bool = False
    VLLM_USE_GLOBAL_CACHE13: bool = False
    VLLM_USE_LIGHTOP: bool = False
+    VLLM_USE_OPT_ZEROS: bool = False
    VLLM_USE_OPT_CAT: bool = False
    VLLM_USE_OPT_MOE_SUM: bool = False
    VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD: bool = False
@@ -1105,6 +1106,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_LIGHTOP":
        lambda: (os.environ.get("VLLM_USE_LIGHTOP", "False").lower() in
                 ("true", "1")),
+    # vLLM will use torch.empty instead of torch.zeros (triton_) for attn output
+    "VLLM_USE_OPT_ZEROS":
+        lambda: (os.environ.get("VLLM_USE_OPT_ZEROS", "False").lower() in
+                 ("true", "1")),
    # vLLM will use opt cat for deepseek-v3
    "VLLM_USE_OPT_CAT":
        lambda: (os.environ.get("VLLM_USE_OPT_CAT", "False").lower() in