add VLLM_USE_PD_SPLIT to split prefill and decode

97fed613 · zhuwenwen · 3912d41c · 97fed613 · 97fed613
Commit 97fed613 authored Nov 13, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 7 additions and 1 deletion

vllm/envs.py vllm/envs.py +6 -0

vllm/v1/core/sched/scheduler.py vllm/v1/core/sched/scheduler.py +1 -1

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -176,6 +176,7 @@ if TYPE_CHECKING:
    VLLM_P2P_ASYNC: bool = False
    VLLM_P2P_BUF_TOKENS: int = 30000
    VLLM_SCHED_ENABLE_MINIMAL_INJECTION: bool = False
+    VLLM_USE_PD_SPLIT: bool = False
 def get_default_cache_root():
    return os.getenv(
@@ -1157,6 +1158,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_SCHED_ENABLE_MINIMAL_INJECTION":
        lambda: (os.getenv("VLLM_SCHED_ENABLE_MINIMAL_INJECTION", "0").lower() in
                 ("true", "1")),
+    # If enabled (default), split prefill and decode instead of mixing them
+    "VLLM_USE_PD_SPLIT":
+        lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "True").lower() in
+                 ("true", "1")), 
 }
 # --8<-- [end:env-vars-definition]

--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1032,7 +1032,7 @@ class Scheduler(SchedulerInterface):
        return scheduler_output
    def schedule(self) -> SchedulerOutput:
-        if self.full_cuda_graph and self.use_mla and self.num_spec_tokens > 0:
+        if (self.full_cuda_graph and self.use_mla and self.num_spec_tokens > 0) or envs.VLLM_USE_PD_SPLIT:
            return self.schedule_split_pd()
        else:
            return self.schedule_default()