Commit 9ff617d7 authored by zhuwenwen
Browse files

[Perf] Change default CUDAGraphMode from FULL_AND_PIECEWISE to PIECEWISE

parent fd8764b3
...@@ -61,6 +61,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY ...@@ -61,6 +61,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.transformers_utils.runai_utils import is_runai_obj_uri from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid from vllm.utils import random_uuid
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm import envs
if TYPE_CHECKING: if TYPE_CHECKING:
from _typeshed import DataclassInstance from _typeshed import DataclassInstance
...@@ -371,15 +372,19 @@ class VllmConfig: ...@@ -371,15 +372,19 @@ class VllmConfig:
if self.compilation_config.cudagraph_mode is None: if self.compilation_config.cudagraph_mode is None:
if envs.VLLM_USE_V1 and self.compilation_config.level \ if envs.VLLM_USE_V1 and self.compilation_config.level \
== CompilationLevel.PIECEWISE: == CompilationLevel.PIECEWISE:
# default to full and piecewise for most models if not envs.VLLM_USE_PIECEWISE:
self.compilation_config.cudagraph_mode = \ # default to full and piecewise for most models
CUDAGraphMode.FULL_AND_PIECEWISE self.compilation_config.cudagraph_mode = \
CUDAGraphMode.FULL_AND_PIECEWISE
# pooling models and encoder-decoder models
# do not support full cudagraphs # pooling models and encoder-decoder models
if self.model_config is not None and \ # do not support full cudagraphs
(self.model_config.pooler_config is not None if self.model_config is not None and \
or self.model_config.is_encoder_decoder): (self.model_config.pooler_config is not None
or self.model_config.is_encoder_decoder):
self.compilation_config.cudagraph_mode = \
CUDAGraphMode.PIECEWISE
else:
self.compilation_config.cudagraph_mode = \ self.compilation_config.cudagraph_mode = \
CUDAGraphMode.PIECEWISE CUDAGraphMode.PIECEWISE
else: else:
......
...@@ -14,7 +14,6 @@ from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass ...@@ -14,7 +14,6 @@ from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.config.utils import config from vllm.config.utils import config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
from vllm import envs
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import VllmConfig from vllm.config import VllmConfig
...@@ -57,7 +56,7 @@ class CUDAGraphMode(enum.Enum): ...@@ -57,7 +56,7 @@ class CUDAGraphMode(enum.Enum):
def max_cudagraph_mode(self) -> 'CUDAGraphMode': def max_cudagraph_mode(self) -> 'CUDAGraphMode':
return CUDAGraphMode(max( return CUDAGraphMode(max(
self.value) if not envs.VLLM_USE_PIECEWISE else min(self.value)) if self.separate_routine() else self self.value)) if self.separate_routine() else self
def has_full_cudagraphs(self) -> bool: def has_full_cudagraphs(self) -> bool:
return self.max_cudagraph_mode() == CUDAGraphMode.FULL return self.max_cudagraph_mode() == CUDAGraphMode.FULL
......
...@@ -1657,7 +1657,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1657,7 +1657,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
("true", "1")), ("true", "1")),
# vLLM will use piecewise # vLLM will use piecewise
"VLLM_USE_PIECEWISE": "VLLM_USE_PIECEWISE":
lambda: (os.environ.get("VLLM_USE_PIECEWISE", "False").lower() in lambda: (os.environ.get("VLLM_USE_PIECEWISE", "True").lower() in
("true", "1")), ("true", "1")),
# vllm will use encoding_dsv32.py for dpsk-v32 # vllm will use encoding_dsv32.py for dpsk-v32
"VLLM_USE_V32_ENCODE": "VLLM_USE_V32_ENCODE":
......
...@@ -3041,7 +3041,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -3041,7 +3041,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# TODO(luka) better system for describing dummy batches # TODO(luka) better system for describing dummy batches
seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1] seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1]
else: else:
seq_lens = max_query_len if not envs.VLLM_USE_PIECEWISE:
seq_lens = max_query_len
else:
# Make sure max_model_len is used at the graph capture time.
seq_lens = self.max_model_len
self.seq_lens.np[:num_reqs] = seq_lens self.seq_lens.np[:num_reqs] = seq_lens
self.seq_lens.np[num_reqs:] = 0 self.seq_lens.np[num_reqs:] = 0
self.seq_lens.copy_to_gpu() self.seq_lens.copy_to_gpu()
...@@ -3662,25 +3666,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -3662,25 +3666,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
CUDAGraphMode.FULL_DECODE_ONLY CUDAGraphMode.FULL_DECODE_ONLY
logger.warning(msg) logger.warning(msg)
# check that if we are doing decode full-cudagraphs it is supported if not envs.VLLM_USE_PIECEWISE:
if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL # check that if we are doing decode full-cudagraphs it is supported
and min_cg_support == AttentionCGSupport.NEVER): if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported " and min_cg_support == AttentionCGSupport.NEVER):
f"with {min_cg_builder_name} backend (support: " msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
f"{min_cg_support})") f"with {min_cg_builder_name} backend (support: "
if (self.compilation_config.level == CompilationLevel.PIECEWISE and f"{min_cg_support})")
(self.compilation_config.splitting_ops_contain_attention() if (self.compilation_config.level == CompilationLevel.PIECEWISE and
or self.compilation_config.use_inductor_graph_partition)): (self.compilation_config.splitting_ops_contain_attention()
msg += "; setting cudagraph_mode=PIECEWISE because "\ or self.compilation_config.use_inductor_graph_partition)):
"attention is compiled piecewise" msg += "; setting cudagraph_mode=PIECEWISE because "\
cudagraph_mode = self.compilation_config.cudagraph_mode = \ "attention is compiled piecewise"
CUDAGraphMode.PIECEWISE cudagraph_mode = self.compilation_config.cudagraph_mode = \
else: CUDAGraphMode.PIECEWISE
msg += "; setting cudagraph_mode=NONE because "\ else:
"attention is not compiled piecewise" msg += "; setting cudagraph_mode=NONE because "\
cudagraph_mode = self.compilation_config.cudagraph_mode = \ "attention is not compiled piecewise"
CUDAGraphMode.NONE cudagraph_mode = self.compilation_config.cudagraph_mode = \
logger.warning(msg) CUDAGraphMode.NONE
logger.warning(msg)
# check that if we are doing spec-decode + decode full-cudagraphs it is # check that if we are doing spec-decode + decode full-cudagraphs it is
# supported # supported
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment