Commit d0de006f authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.7.2-dev-wm' into 'v0.7.2-dev'

[feat]添加VLLM_ENFORCE_EAGER_BS_THRESHOLD环境变量,支持cudagraph模式下,当bs大于阈值时,强制切换为eager模式,对大bs有效果

See merge request dcutoolkit/deeplearing/vllm!95
parents 6f49c1ed 3ce3e3d9
...@@ -95,6 +95,7 @@ if TYPE_CHECKING: ...@@ -95,6 +95,7 @@ if TYPE_CHECKING:
VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_RAY_BUNDLE_INDICES: str = ""
VLLM_SPEC_DECODE_EAGER: bool = False VLLM_SPEC_DECODE_EAGER: bool = False
VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None
def get_default_cache_root(): def get_default_cache_root():
...@@ -618,6 +619,10 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -618,6 +619,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# If set, vLLM will disable the draft model in cudagraph mode. # If set, vLLM will disable the draft model in cudagraph mode.
"VLLM_SPEC_DECODE_EAGER": "VLLM_SPEC_DECODE_EAGER":
lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))), lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))),
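# If set to a positive integer, decode batches larger than this size fall back
# to eager mode even when CUDA graphs are enabled. Unset or non-positive values
# (default: -1) leave the threshold disabled.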
"VLLM_ENFORCE_EAGER_BS_THRESHOLD":
lambda: int(os.environ.get("VLLM_ENFORCE_EAGER_BS_THRESHOLD", "-1")),
} }
# end-env-vars-definition # end-env-vars-definition
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import sys
import dataclasses import dataclasses
import gc import gc
import inspect import inspect
...@@ -1109,6 +1110,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1109,6 +1110,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# multi-step model runner does not have `_builder_cls` # multi-step model runner does not have `_builder_cls`
self.builder = self._builder_cls(weakref.proxy(self)) self.builder = self._builder_cls(weakref.proxy(self))
self.enforce_eager_bs_threshould = sys.maxsize
if envs.VLLM_ENFORCE_EAGER_BS_THRESHOLD is not None and envs.VLLM_ENFORCE_EAGER_BS_THRESHOLD > 0:
self.enforce_eager_bs_threshould = envs.VLLM_ENFORCE_EAGER_BS_THRESHOLD
def load_model(self) -> None: def load_model(self) -> None:
logger.info("Starting to load model %s...", self.model_config.model) logger.info("Starting to load model %s...", self.model_config.model)
with DeviceMemoryProfiler() as m: with DeviceMemoryProfiler() as m:
...@@ -1680,7 +1685,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): ...@@ -1680,7 +1685,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
# TODO(andoorve): We can remove this once all # TODO(andoorve): We can remove this once all
# virtual engines share the same kv cache. # virtual engines share the same kv cache.
virtual_engine = model_input.virtual_engine virtual_engine = model_input.virtual_engine
if prefill_meta is None and decode_meta.use_cuda_graph: if prefill_meta is None and decode_meta.use_cuda_graph and \
model_input.input_tokens.shape[0] <= self.enforce_eager_bs_threshould:
assert model_input.input_tokens is not None assert model_input.input_tokens is not None
graph_batch_size = model_input.input_tokens.shape[0] graph_batch_size = model_input.input_tokens.shape[0]
model_executable = self.graph_runners[virtual_engine][ model_executable = self.graph_runners[virtual_engine][
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment