Commit d0de006f authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.7.2-dev-wm' into 'v0.7.2-dev'

[feat]添加VLLM_ENFORCE_EAGER_BS_THRESHOLD环境变量,支持cudagraph模式下,当bs大于阈值时,强制切换为eager模式,对大bs有效果

See merge request dcutoolkit/deeplearing/vllm!95
parents 6f49c1ed 3ce3e3d9
......@@ -95,6 +95,7 @@ if TYPE_CHECKING:
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = ""
VLLM_SPEC_DECODE_EAGER: bool = False
VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None
def get_default_cache_root():
......@@ -618,6 +619,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# If set, vLLM will disable the draft model in cudagraph mode.
"VLLM_SPEC_DECODE_EAGER":
lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))),
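# If set to a positive integer, decode batches whose batch size exceeds
# this threshold run in eager mode instead of replaying a CUDA graph.
# Unset or non-positive values (default: -1) leave the threshold disabled.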
"VLLM_ENFORCE_EAGER_BS_THRESHOLD":
lambda: int(os.environ.get("VLLM_ENFORCE_EAGER_BS_THRESHOLD", "-1")),
}
# end-env-vars-definition
......
# SPDX-License-Identifier: Apache-2.0
import sys
import dataclasses
import gc
import inspect
......@@ -1109,6 +1110,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# multi-step model runner does not have `_builder_cls`
self.builder = self._builder_cls(weakref.proxy(self))
self.enforce_eager_bs_threshould = sys.maxsize
if envs.VLLM_ENFORCE_EAGER_BS_THRESHOLD is not None and envs.VLLM_ENFORCE_EAGER_BS_THRESHOLD > 0:
self.enforce_eager_bs_threshould = envs.VLLM_ENFORCE_EAGER_BS_THRESHOLD
def load_model(self) -> None:
logger.info("Starting to load model %s...", self.model_config.model)
with DeviceMemoryProfiler() as m:
......@@ -1680,7 +1685,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
# TODO(andoorve): We can remove this once all
# virtual engines share the same kv cache.
virtual_engine = model_input.virtual_engine
if prefill_meta is None and decode_meta.use_cuda_graph:
if prefill_meta is None and decode_meta.use_cuda_graph and \
model_input.input_tokens.shape[0] <= self.enforce_eager_bs_threshould:
assert model_input.input_tokens is not None
graph_batch_size = model_input.input_tokens.shape[0]
model_executable = self.graph_runners[virtual_engine][
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment