Add VLLM_USE_PP_SYNC to enable an optional CUDA sync after pp recv

a1e9e36f · zhuwenwen · 8375370f · a1e9e36f · a1e9e36f
Commit a1e9e36f authored Nov 14, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 0 deletions

vllm/distributed/parallel_state.py vllm/distributed/parallel_state.py +2 -0

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -735,6 +735,8 @@ class GroupCoordinator:
                    torch.distributed.recv(tensor,
                                           src=self.ranks[src],
                                           group=group)
+                if envs.VLLM_USE_PP_SYNC:
+                    torch.cuda.synchronize()
                if use_all_gather:
                    # do the allgather
                    tensor = all_gather_group.all_gather(  # type: ignore

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -178,6 +178,7 @@ if TYPE_CHECKING:
    VLLM_P2P_BUF_TOKENS: int = 30000
    VLLM_SCHED_ENABLE_MINIMAL_INJECTION: bool = False
    VLLM_USE_PD_SPLIT: bool = False
+    VLLM_USE_PP_SYNC: bool = False
 def get_default_cache_root():
    return os.getenv(
@@ -1156,6 +1157,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_PD_SPLIT":
        lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "True").lower() in
                 ("true", "1")), 
+    # If set, vLLM calls torch.cuda.synchronize() after pp recv to avoid a vmfault
+    "VLLM_USE_PP_SYNC":
+        lambda: (os.environ.get("VLLM_USE_PP_SYNC", "False").lower() in
+                 ("true", "1")), 
 }
 # --8<-- [end:env-vars-definition]