[Hybrid] Simplify accepted token counting in spec decode for hybrid models (#38372)

507df79a · Francesco Fusco · GitHub · 1696c864 · 507df79a
Unverified Commit 507df79a authored Apr 15, 2026 by Francesco Fusco Committed by GitHub Apr 14, 2026
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 19 deletions

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +4 -19

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1428,26 +1428,11 @@ class GPUModelRunner(
        # TODO: Remove .cpu() sync to enable fully async for hybrid model;
        # Use num_computed_tokens.gpu instead of req.num_computed_tokens to
        # support aligned mamba cache mode.
-        # Find the number of accepted tokens for each sequence.
+        # Count the number of accepted tokens for each sequence.
+        # Valid tokens form a contiguous prefix starting at position 0, so the count of
+        # non-(-1) entries per row equals the index of the first -1 (accepted count).
        num_reqs = output_token_ids.size(0)
-        self.num_accepted_tokens.gpu[:num_reqs] = (
+        self.num_accepted_tokens.gpu[:num_reqs] = (output_token_ids != -1).sum(dim=1)
-            (
-                torch.cat(
-                    [
-                        output_token_ids,
-                        torch.full(
-                            (num_reqs, 1),
-                            -1,
-                            device=output_token_ids.device,
-                        ),
-                    ],
-                    dim=1,
-                )
-                == -1
-            )
-            .int()
-            .argmax(-1)
-        )
        if self.cache_config.mamba_cache_mode == "align":
            for i, num_tokens in enumerate(