Commit 7488257b authored by 王敏
Browse files

[feat]添加VLLM_SPEC_DECODE_EAGER环境变量,用于选择draft model是否强制使用eager模式,在hygon cpu上ds3 mtp提升较大

parent 18ec9eaa
...@@ -1854,7 +1854,7 @@ class SpeculativeConfig: ...@@ -1854,7 +1854,7 @@ class SpeculativeConfig:
max_model_len=None, max_model_len=None,
spec_target_max_model_len=target_model_config.max_model_len, spec_target_max_model_len=target_model_config.max_model_len,
quantization=draft_quantization, quantization=draft_quantization,
enforce_eager=target_model_config.enforce_eager, enforce_eager=True if envs.VLLM_SPEC_DECODE_EAGER else target_model_config.enforce_eager,
max_seq_len_to_capture=target_model_config. max_seq_len_to_capture=target_model_config.
max_seq_len_to_capture, max_seq_len_to_capture,
max_logprobs=target_model_config.max_logprobs, max_logprobs=target_model_config.max_logprobs,
......
...@@ -94,6 +94,7 @@ if TYPE_CHECKING: ...@@ -94,6 +94,7 @@ if TYPE_CHECKING:
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_RAY_BUNDLE_INDICES: str = ""
VLLM_SPEC_DECODE_EAGER: bool = False
def get_default_cache_root(): def get_default_cache_root():
...@@ -613,6 +614,10 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -613,6 +614,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# models the alignment is already naturally aligned to 256 bytes. # models the alignment is already naturally aligned to 256 bytes.
"VLLM_CUDA_MEM_ALIGN_KV_CACHE": "VLLM_CUDA_MEM_ALIGN_KV_CACHE":
lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))), lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
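# If set to 1, the speculative-decoding draft model is forced to run in
# eager mode (CUDA graphs disabled for the draft model); otherwise it
# inherits the target model's enforce_eager setting.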
"VLLM_SPEC_DECODE_EAGER":
lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))),
} }
# end-env-vars-definition # end-env-vars-definition
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment