[feat]添加VLLM_ENFORCE_EAGER_BS_THRESHOLD环境变量，支持cudagraph模式下，当bs大于阈值时，强制切换为eager模式，对大bs有效果

3ce3e3d9 · 王敏 · 6f49c1ed · 3ce3e3d9 · 3ce3e3d9
Commit 3ce3e3d9 authored Apr 15, 2025 by 王敏
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 1 deletion

vllm/envs.py vllm/envs.py +5 -0

vllm/worker/model_runner.py vllm/worker/model_runner.py +7 -1

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -95,6 +95,7 @@ if TYPE_CHECKING:
    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
    VLLM_RAY_BUNDLE_INDICES: str = ""
    VLLM_SPEC_DECODE_EAGER: bool = False
+    VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None


 def get_default_cache_root():
@@ -618,6 +619,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # If set, vLLM will disable the draft model in cudagraph mode.
    "VLLM_SPEC_DECODE_EAGER":
    lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))),
+
+    # If > 0, decode batches larger than this size run in eager mode instead of CUDA graph (default -1: disabled).
+    "VLLM_ENFORCE_EAGER_BS_THRESHOLD":
+    lambda: int(os.environ.get("VLLM_ENFORCE_EAGER_BS_THRESHOLD", "-1")),
 }

 # end-env-vars-definition

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
 # SPDX-License-Identifier: Apache-2.0

+import sys
 import dataclasses
 import gc
 import inspect
@@ -1109,6 +1110,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
            # multi-step model runner does not have `_builder_cls`
            self.builder = self._builder_cls(weakref.proxy(self))

+        self.enforce_eager_bs_threshould = sys.maxsize
+        if envs.VLLM_ENFORCE_EAGER_BS_THRESHOLD is not None and envs.VLLM_ENFORCE_EAGER_BS_THRESHOLD > 0:
+            self.enforce_eager_bs_threshould = envs.VLLM_ENFORCE_EAGER_BS_THRESHOLD
+
    def load_model(self) -> None:
        logger.info("Starting to load model %s...", self.model_config.model)
        with DeviceMemoryProfiler() as m:
@@ -1680,7 +1685,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
        # TODO(andoorve): We can remove this once all
        # virtual engines share the same kv cache.
        virtual_engine = model_input.virtual_engine
-        if prefill_meta is None and decode_meta.use_cuda_graph:
+        if prefill_meta is None and decode_meta.use_cuda_graph and \
+                model_input.input_tokens.shape[0] <= self.enforce_eager_bs_threshould:
            assert model_input.input_tokens is not None
            graph_batch_size = model_input.input_tokens.shape[0]
            model_executable = self.graph_runners[virtual_engine][