[Bugfix][Core] Fix stuck chunked pipeline parallelism with async scheduling (#38726)

Signed-off-by: Jing Wang <jingwang96@qq.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

[Bugfix][Core] Fix stuck chunked pipeline parallelism with async scheduling (#38726)
Signed-off-by: Jing Wang <jingwang96@qq.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
747256bb · Jing Wang · GitHub · 1174723e · 747256bb
Unverified Commit 747256bb authored Apr 18, 2026 by Jing Wang Committed by GitHub Apr 17, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 4 deletions

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +18 -4

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3758,6 +3758,15 @@ class GPUModelRunner(
        return slot_mappings_by_gid, slot_mappings_by_layer
+    def _is_all_reqs_chunked_prefill(self) -> bool:
+        """Check if all scheduled requests are marked to discard sampled tokens.
+
+        This is true when `discard_request_mask` is set for every scheduled
+        request (e.g., for chunked prefill requests that are not the last
+        prefill chunk). Only the first `input_batch.num_reqs` entries of the
+        mask are inspected.
+
+        NOTE(review): when `num_reqs == 0` the inspected slice is empty, and
+        `.all()` presumably returns True (NumPy semantics, assuming `.np` is
+        an ndarray) -- confirm callers are fine with an empty batch being
+        reported as "all chunked prefill".
+        """
+        num_reqs = self.input_batch.num_reqs
+        return bool(self.discard_request_mask.np[:num_reqs].all())
    @torch.inference_mode()
    def execute_model(
        self,
@@ -4361,9 +4370,12 @@ class GPUModelRunner(
        assert sampled_token_ids.dim() == 2 and sampled_token_ids.shape[-1] == 1, (
            "PP+async expects sampled_token_ids to have shape [num_reqs, 1]"
        )
-        torch.distributed.broadcast(
+        # Skip for chunked prefill: sampled tokens are dummy
-            sampled_token_ids, src=pp.rank, group=pp.device_group
+        # and will be discarded, no need to broadcast.
-        )
+        if not self._is_all_reqs_chunked_prefill():
+            torch.distributed.broadcast(
+                sampled_token_ids, src=pp.rank, group=pp.device_group
+            )
    def _pp_receive_prev_sampled_token_ids_to_input_batch(self) -> None:
        """Receive sampled token ids broadcast from last PP stage"""
@@ -4372,7 +4384,9 @@ class GPUModelRunner(
        num_reqs = self.input_batch.num_reqs
        # `prev_sampled_token_ids` is expected to have shape [num_reqs, 1].
        recv = torch.empty((num_reqs, 1), dtype=torch.int32, device=self.device)
-        torch.distributed.broadcast(recv, src=pp.last_rank, group=pp.device_group)
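+        # Skip for chunked prefill: the sending stage skips its broadcast under
+        # the same condition, so this collective must be skipped here as well.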
+        if not self._is_all_reqs_chunked_prefill():
+            torch.distributed.broadcast(recv, src=pp.last_rank, group=pp.device_group)
        self.input_batch.prev_sampled_token_ids = recv
        # construct `prev_req_id_to_index` here so `_prepare_input_ids`