[feat]添加VLLM_SPEC_DECODE_EAGER环境变量，用于选择draft model是否强制使用eager模式，在hygon cpu上ds3 mtp提升较大

7488257b · 王敏 · 18ec9eaa · 7488257b · 7488257b
Commit 7488257b authored Mar 28, 2025 by 王敏
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 1 deletion

vllm/config.py vllm/config.py +1 -1

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1854,7 +1854,7 @@ class SpeculativeConfig:
                max_model_len=None,
                spec_target_max_model_len=target_model_config.max_model_len,
                quantization=draft_quantization,
-                enforce_eager=target_model_config.enforce_eager,
+                enforce_eager=True if envs.VLLM_SPEC_DECODE_EAGER else target_model_config.enforce_eager,
                max_seq_len_to_capture=target_model_config.
                max_seq_len_to_capture,
                max_logprobs=target_model_config.max_logprobs,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -94,6 +94,7 @@ if TYPE_CHECKING:
    VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
    VLLM_RAY_BUNDLE_INDICES: str = ""
+    VLLM_SPEC_DECODE_EAGER: bool = False
 def get_default_cache_root():
@@ -613,6 +614,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # models the alignment is already naturally aligned to 256 bytes.
    "VLLM_CUDA_MEM_ALIGN_KV_CACHE":
    lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
+    # If set, force the draft model to run in eager mode (no CUDA graphs).
+    "VLLM_SPEC_DECODE_EAGER":
+    lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))),
 }
 # end-env-vars-definition