Commit 9be76efd authored by zhuwenwen's avatar zhuwenwen
Browse files

Add VLLM_USE_PIECEWISE environment variable to use piecewise CUDA graph mode

parent 77599fa7
...@@ -14,6 +14,7 @@ from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass ...@@ -14,6 +14,7 @@ from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.config.utils import config from vllm.config.utils import config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
from vllm import envs
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import VllmConfig from vllm.config import VllmConfig
...@@ -56,7 +57,7 @@ class CUDAGraphMode(enum.Enum): ...@@ -56,7 +57,7 @@ class CUDAGraphMode(enum.Enum):
def max_cudagraph_mode(self) -> 'CUDAGraphMode': def max_cudagraph_mode(self) -> 'CUDAGraphMode':
return CUDAGraphMode(max( return CUDAGraphMode(max(
self.value)) if self.separate_routine() else self self.value) if not envs.VLLM_USE_PIECEWISE else min(self.value)) if self.separate_routine() else self
def has_full_cudagraphs(self) -> bool: def has_full_cudagraphs(self) -> bool:
return self.max_cudagraph_mode() == CUDAGraphMode.FULL return self.max_cudagraph_mode() == CUDAGraphMode.FULL
......
...@@ -237,6 +237,7 @@ if TYPE_CHECKING: ...@@ -237,6 +237,7 @@ if TYPE_CHECKING:
USE_FUSED_SILU_MUL_QUANT: bool = False USE_FUSED_SILU_MUL_QUANT: bool = False
VLLM_USE_PD_SPLIT: bool = False VLLM_USE_PD_SPLIT: bool = False
VLLM_USE_PP_SYNC: bool = False VLLM_USE_PP_SYNC: bool = False
VLLM_USE_PIECEWISE: bool = False
def get_default_cache_root(): def get_default_cache_root():
...@@ -1648,6 +1649,11 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1648,6 +1649,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_PP_SYNC": "VLLM_USE_PP_SYNC":
lambda: (os.environ.get("VLLM_USE_PP_SYNC", "False").lower() in lambda: (os.environ.get("VLLM_USE_PP_SYNC", "False").lower() in
("true", "1")), ("true", "1")),
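# If set to "true"/"1" (runtime default: "True"), CUDAGraphMode.max_cudagraph_mode()
# resolves separate-routine modes with min(self.value) instead of max(self.value).
# NOTE(review): the TYPE_CHECKING stub declares the default as False - confirm which default is intended.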
"VLLM_USE_PIECEWISE":
lambda: (os.environ.get("VLLM_USE_PIECEWISE", "True").lower() in
("true", "1")),
} }
# --8<-- [end:env-vars-definition] # --8<-- [end:env-vars-definition]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment