Commit a1e9e36f authored by zhuwenwen's avatar zhuwenwen
Browse files

Add VLLM_USE_PP_SYNC env var to enable a device sync after pipeline-parallel (pp) recv

parent 8375370f
...@@ -735,6 +735,8 @@ class GroupCoordinator: ...@@ -735,6 +735,8 @@ class GroupCoordinator:
torch.distributed.recv(tensor, torch.distributed.recv(tensor,
src=self.ranks[src], src=self.ranks[src],
group=group) group=group)
if envs.VLLM_USE_PP_SYNC:
torch.cuda.synchronize()
if use_all_gather: if use_all_gather:
# do the allgather # do the allgather
tensor = all_gather_group.all_gather( # type: ignore tensor = all_gather_group.all_gather( # type: ignore
......
...@@ -178,6 +178,7 @@ if TYPE_CHECKING: ...@@ -178,6 +178,7 @@ if TYPE_CHECKING:
VLLM_P2P_BUF_TOKENS: int = 30000 VLLM_P2P_BUF_TOKENS: int = 30000
VLLM_SCHED_ENABLE_MINIMAL_INJECTION: bool = False VLLM_SCHED_ENABLE_MINIMAL_INJECTION: bool = False
VLLM_USE_PD_SPLIT: bool = False VLLM_USE_PD_SPLIT: bool = False
VLLM_USE_PP_SYNC: bool = False
def get_default_cache_root(): def get_default_cache_root():
return os.getenv( return os.getenv(
...@@ -1156,6 +1157,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1156,6 +1157,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_PD_SPLIT": "VLLM_USE_PD_SPLIT":
lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "True").lower() in lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "True").lower() in
("true", "1")), ("true", "1")),
# If set, vLLM calls torch.cuda.synchronize() after torch.distributed.recv in GroupCoordinator to avoid pipeline-parallel (pp) vmfaults
"VLLM_USE_PP_SYNC":
lambda: (os.environ.get("VLLM_USE_PP_SYNC", "False").lower() in
("true", "1")),
} }
# --8<-- [end:env-vars-definition] # --8<-- [end:env-vars-definition]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment