[V1][Minor] Minor simplification for get_computed_blocks (#16139)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V1][Minor] Minor simplification for get_computed_blocks (#16139)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
3749e287 · Woosuk Kwon · GitHub · 86fc2321 · 3749e287
Unverified commit 3749e287, authored Apr 06, 2025 by Woosuk Kwon; committed by GitHub on Apr 06, 2025.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 31 additions and 30 deletions

vllm/v1/core/kv_cache_manager.py vllm/v1/core/kv_cache_manager.py +31 -30

No files found.
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -126,7 +126,10 @@ class KVCacheManager:
            self.req_to_block_hashes[request.request_id] = block_hashes

        self.prefix_cache_stats.requests += 1
-        if request.sampling_params.prompt_logprobs is None:
+        # When the request requires prompt logprobs, we skip prefix caching.
+        if request.sampling_params.prompt_logprobs is not None:
+            return [], 0
+
        if len(block_hashes) * self.block_size == request.num_tokens:
            # When prompt length is divisible by the block size and all
            # blocks are cached, we need to recompute the last token. This
@@ -142,22 +145,20 @@ class KVCacheManager:

        computed_blocks = (
            self.specialized_manager.find_longest_cache_hit(block_hashes))
+        self.prefix_cache_stats.queries += len(block_hashes)
+        self.prefix_cache_stats.hits += len(computed_blocks)

        if last_block_hash is not None:
            # Add back the last block hash if it was removed.
+            # NOTE: Because block_hashes is cached in req_to_block_hashes,
+            # we shouldn't modify it directly.
            block_hashes.append(last_block_hash)

-            self.prefix_cache_stats.queries += len(block_hashes)
-            self.prefix_cache_stats.hits += len(computed_blocks)
-
        # NOTE(woosuk): Since incomplete blocks are not eligible for
        # sharing, `num_computed_tokens` is always a multiple of
        # `block_size`.
        num_computed_tokens = len(computed_blocks) * self.block_size
        return computed_blocks, num_computed_tokens
-        else:
-            # Skip cache hits for prompt logprobs
-            return [], 0

    def allocate_slots(
        self,