Commit 87223113 authored by zhuwenwen's avatar zhuwenwen
Browse files

[feat] 添加 VLLM_SPEC_DECODE_EAGER 环境变量,用于选择 draft model 是否强制使用 eager 模式;在 hygon cpu 上 ds3 mtp 的性能提升较大

parent 469e903b
......@@ -2072,7 +2072,7 @@ class SpeculativeConfig:
spec_target_max_model_len=self.target_model_config.
max_model_len,
quantization=self.quantization,
enforce_eager=self.target_model_config.enforce_eager,
enforce_eager=True if envs.VLLM_SPEC_DECODE_EAGER else self.target_model_config.enforce_eager,
max_seq_len_to_capture=self.target_model_config.
max_seq_len_to_capture,
max_logprobs=self.target_model_config.max_logprobs,
......
......@@ -20,6 +20,7 @@ if TYPE_CHECKING:
VLLM_USE_OPT_OP: bool = False
VLLM_USE_TC_PAGED_ATTN: bool = False
VLLM_USE_PA_PRINT_PARAM: bool = False
VLLM_SPEC_DECODE_EAGER: bool = False
VLLM_FLASH_ATTN_VERSION: Optional[int] = None
LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None
......@@ -269,6 +270,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_PA_PRINT_PARAM":
lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in
("true", "1")),
# If set to a non-zero integer (e.g. 1), vLLM forces the speculative-decoding
# draft model to run in eager mode, i.e. without CUDA graph capture.
"VLLM_SPEC_DECODE_EAGER":
lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))),
# Force vllm to use a specific flash-attention version (2 or 3), only valid
# when using the flash-attention backend.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment