[feat]添加VLLM_SPEC_DECODE_EAGER环境变量，用于选择draft model是否强制使用eager模式，在hygon cpu上ds3 mtp提升较大

87223113 · zhuwenwen · 469e903b · 87223113 · 87223113
Commit 87223113 authored Mar 28, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 1 deletion

vllm/config.py vllm/config.py +1 -1

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2072,7 +2072,7 @@ class SpeculativeConfig:
                    spec_target_max_model_len=self.target_model_config.
                    max_model_len,
                    quantization=self.quantization,
-                    enforce_eager=self.target_model_config.enforce_eager,
+                    enforce_eager=True if envs.VLLM_SPEC_DECODE_EAGER else self.target_model_config.enforce_eager,
                    max_seq_len_to_capture=self.target_model_config.
                    max_seq_len_to_capture,
                    max_logprobs=self.target_model_config.max_logprobs,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -20,6 +20,7 @@ if TYPE_CHECKING:
    VLLM_USE_OPT_OP: bool = False
    VLLM_USE_TC_PAGED_ATTN: bool = False
    VLLM_USE_PA_PRINT_PARAM: bool = False 
+    VLLM_SPEC_DECODE_EAGER: bool = False
    VLLM_FLASH_ATTN_VERSION: Optional[int] = None
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: Optional[str] = None
@@ -269,6 +270,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_PA_PRINT_PARAM":
    lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in
             ("true", "1")),
+    
+    # If set, forces the spec-decode draft model into eager mode (no CUDA graph).
+    "VLLM_SPEC_DECODE_EAGER":
+    lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))),

    # Force vllm to use a specific flash-attention version (2 or 3), only valid
    # when using the flash-attention backend.