[Model Runner V2] A bit more PP simplification (#34766)

Signed-off-by: Nick Hill <nickhill123@gmail.com>

[Model Runner V2] A bit more PP simplification (#34766)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
a49ea5a5 · Nick Hill · GitHub · 30ebe0dc · a49ea5a5 · a49ea5a5
Unverified Commit a49ea5a5 authored Feb 17, 2026 by Nick Hill Committed by GitHub Feb 17, 2026
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 12 deletions

vllm/v1/worker/gpu/model_runner.py vllm/v1/worker/gpu/model_runner.py +5 -7

vllm/v1/worker/gpu/pp_utils.py vllm/v1/worker/gpu/pp_utils.py +3 -5

No files found.
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -1003,15 +1003,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        hidden_states, input_batch, kv_connector_output = self.execute_model_state
        self.execute_model_state = None  # type: ignore

+        if not self.is_last_pp_rank:
            # Non-last PP rank: hidden_states is None because this rank produced
            # IntermediateTensors instead of final hidden states. Receive the
-        # sampled tokens broadcast by the last rank and update local state.
-        if not self.is_last_pp_rank:
-            received = pp_receive(
+            # sampled tokens broadcast from the last rank and update local state.
+            sampled, num_sampled, num_rejected = pp_receive(
                input_batch.num_reqs, max_sample_len=self.num_speculative_steps + 1
            )
-            assert received is not None
-            sampled, num_sampled, num_rejected = received
            self.postprocess(input_batch, sampled, num_sampled, num_rejected)
            return None

@@ -1020,8 +1018,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            hidden_states, input_batch, grammar_output
        )

-        # Broadcast to non-last PP ranks (handles spec decode multi-token).
        if self.use_pp:
+            # Broadcast to non-last PP ranks (handles spec decode multi-token).
            pp_broadcast(sampler_output.sampled_token_ids, num_sampled, num_rejected)

        prompt_logprobs_dict = self.prompt_logprobs_worker.compute_prompt_logprobs(

--- a/vllm/v1/worker/gpu/pp_utils.py
+++ b/vllm/v1/worker/gpu/pp_utils.py
@@ -13,8 +13,7 @@ def pp_broadcast(
    num_rejected: torch.Tensor,
 ) -> None:
    pp = get_pp_group()
-    if not pp.is_last_rank:
-        return
+    assert pp.is_last_rank

    assert sampled_token_ids.dtype == torch.int64
    torch.distributed.broadcast(
@@ -27,10 +26,9 @@ def pp_broadcast(

 def pp_receive(
    num_reqs: int, max_sample_len: int = 1
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    pp = get_pp_group()
-    if pp.is_last_rank:
-        return None
+    assert not pp.is_last_rank

    sampled_tokens = torch.empty(
        num_reqs, max_sample_len, dtype=torch.int64, device=pp.device