[V1][BugFix] Fix edge case in VLM scheduling (#12065)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V1][BugFix] Fix edge case in VLM scheduling (#12065)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
b7ee940a · Woosuk Kwon · GitHub · 9ddac563 · b7ee940a
Unverified Commit b7ee940a authored Jan 14, 2025 by Woosuk Kwon Committed by GitHub Jan 14, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 11 deletions

vllm/v1/core/scheduler.py vllm/v1/core/scheduler.py +15 -11

No files found.
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -373,18 +373,22 @@ class Scheduler:
            if self.encoder_cache_manager.has_cache(request, i):
                # The encoder input is already computed and cached.
                continue
-            if not self.encoder_cache_manager.can_allocate(request, i):
-                # The encoder cache is full. We can only schedule the decoder
-                # tokens just before the encoder input.
-                num_new_tokens = start_pos - num_computed_tokens
-                break
-            if num_encoder_tokens > encoder_budget:
-                # The encoder budget is exhausted. We can only schedule the
-                # decoder tokens up until the encoder input.
-                # NOTE(woosuk): We assume that the encoder tokens should be
-                # processed altogether, as the encoder usually uses
+            if (not self.encoder_cache_manager.can_allocate(request, i)
+                    or num_encoder_tokens > encoder_budget):
+                # The encoder cache is full or the encoder budget is exhausted.
+                # NOTE(woosuk): We assume that the encoder input tokens should
+                # be processed altogether, as the encoder usually uses
                # bidirectional attention.
-                num_new_tokens = start_pos - num_computed_tokens
+                if num_computed_tokens < start_pos:
+                    # We only schedule the decoder tokens just before the
+                    # encoder input.
+                    num_new_tokens = start_pos - num_computed_tokens
+                else:
+                    # Because of prefix caching, num_computed_tokens is greater
+                    # than start_pos even though its encoder input is not
+                    # available. In this case, we can't schedule any token for
+                    # the request in this step.
+                    num_new_tokens = 0
                break

            encoder_budget -= num_encoder_tokens