[Bugfix] Fix torchrun PP broadcast deadlock with async scheduling (#33701)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>

[Bugfix] Fix torchrun PP broadcast deadlock with async scheduling (#33701)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
02080179 · Isotr0py · GitHub · 1b8fe6f7 · 02080179 · 02080179
Unverified Commit 02080179 authored Feb 04, 2026 by Isotr0py Committed by GitHub Feb 04, 2026
3 changed files
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -32,9 +32,6 @@ llm = LLM(
    gpu_memory_utilization=random.uniform(0.7, 0.9),
    swap_space=random.randint(1, 4),
    seed=0,
-    # FIXME(Isotr0py): async scheduling causes deadlock
-    # on torchrun with PP, need to investigate further.
-    async_scheduling=False,
 )

 outputs = llm.generate(prompts, sampling_params)

--- a/tests/distributed/test_torchrun_example_moe.py
+++ b/tests/distributed/test_torchrun_example_moe.py
@@ -39,9 +39,6 @@ llm = LLM(
    gpu_memory_utilization=random.uniform(0.7, 0.9),
    swap_space=random.randint(1, 4),
    seed=0,
-    # FIXME(Isotr0py): async scheduling causes deadlock
-    # on torchrun with PP, need to investigate further.
-    async_scheduling=False,
 )

 outputs = llm.generate(prompts, sampling_params)

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3666,7 +3666,10 @@ class GPUModelRunner(
        )
        if self.use_async_scheduling:
            pp = get_pp_group()
-            if pp.world_size > 1 and pp.is_last_rank:
+            # In torchrun external_launcher PP mode (broadcast_pp_output=True),
+            # PP outputs are already broadcast to all ranks at logits computation,
+            # so there is no need to send the sampled token ids again here.
+            if not self.broadcast_pp_output and pp.world_size > 1 and pp.is_last_rank:
                self._pp_broadcast_prev_sampled_token_ids(
                    sampler_output.sampled_token_ids
                )