add VLLM_USE_MERGE_ATTN_STATES_OPT to control merge_attn_states support

2071c380 · zhuwenwen · 48b4c41d · 2071c380 · 2071c380
Commit 2071c380 authored Sep 13, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 1 deletion

vllm/attention/ops/merge_attn_states.py vllm/attention/ops/merge_attn_states.py +2 -1

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/vllm/attention/ops/merge_attn_states.py
+++ b/vllm/attention/ops/merge_attn_states.py
@@ -5,6 +5,7 @@ from typing import Optional
 import torch
 from vllm.platforms import current_platform
+from vllm import envs
 def merge_attn_states(
@@ -31,7 +32,7 @@ def merge_attn_states(
            return headdim % 4 == 0
        return headdim % 8 == 0
-    if (current_platform.is_cuda() or current_platform.is_rocm() and supported_dtypes(output)
+    if (current_platform.is_cuda() or envs.VLLM_USE_MERGE_ATTN_STATES_OPT and supported_dtypes(output)
            and supported_headdim(output)):
        from vllm._custom_ops import merge_attn_states
        return merge_attn_states(output, prefix_output, prefix_lse,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -166,6 +166,7 @@ if TYPE_CHECKING:
    VLLM_USE_GLOBAL_CACHE13: bool = False
    VLLM_USE_LIGHT_OP: bool = False
    VLLM_USE_TRITON_CAT: bool = False
+    VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
 def get_default_cache_root():
    return os.getenv(
@@ -1099,6 +1100,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_TRITON_CAT":
        lambda: (os.environ.get("VLLM_USE_TRITON_CAT", "True").lower() in
                 ("true", "1")),  
+    # If set, vLLM will use the optimized merge_attn_states op, not Triton
+    "VLLM_USE_MERGE_ATTN_STATES_OPT":
+        lambda: (os.environ.get("VLLM_USE_MERGE_ATTN_STATES_OPT", "True").lower() in
+                 ("true", "1")),  
 }
 # --8<-- [end:env-vars-definition]