[v1][Bugfix] Only cache blocks that are not in the prefix cache (#14073)

b9f1d429 · Chen Zhang · GitHub · b28246f6 · b9f1d429 · b9f1d429
Unverified Commit b9f1d429 authored Mar 01, 2025 by Chen Zhang Committed by GitHub Mar 01, 2025
Show whitespace changes
Inline Side-by-side

Showing with 9 additions and 22 deletions

vllm/v1/core/block_pool.py vllm/v1/core/block_pool.py +4 -18

vllm/v1/core/kv_cache_manager.py vllm/v1/core/kv_cache_manager.py +5 -4

No files found.
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -107,34 +107,20 @@ class BlockPool:
            assert prev_block.block_hash is not None
            prev_block_hash_value = prev_block.block_hash.hash_value
-        # Find the first uncached block.
+        for i, blk in enumerate(new_full_blocks):
-        # FIXME: num_cached_blocks should be corrected by the caller
-        # so this should never happen.
-        offset = 0
-        for blk in new_full_blocks:
-            if blk.block_hash is None:
-                break
-            else:
-                prev_block_hash_value = blk.block_hash.hash_value
-                offset += 1
-        else:
-            # All blocks are cached.
-            return
-        for i, blk in enumerate(new_full_blocks[offset:]):
-            blk_idx = num_cached_blocks + offset + i
            assert blk.block_hash is None
-            if i + offset < len(new_block_hashes):
+            if i < len(new_block_hashes):
                # The block hash may already be computed in
                # "get_computed_blocks" if the tokens are not generated by
                # this request (either the prompt tokens or the previously
                # generated tokens with preemption). In this case we simply
                # reuse the block hash.
-                block_hash = new_block_hashes[i + offset]
+                block_hash = new_block_hashes[i]
            else:
                # Otherwise compute the block hash and cache it in the request
                # in case it will be preempted in the future.
+                blk_idx = num_cached_blocks + i
                start_token_idx = blk_idx * block_size
                end_token_idx = (blk_idx + 1) * block_size
                block_tokens = request.all_token_ids[

--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -65,7 +65,7 @@ class KVCacheManager:
        # This is used to track the number of cached blocks for each request.
        # This is only used to track the RUNNING requests, we do not track the
        # data for preempted ones.
-        self.num_cached_block: Dict[str, int] = defaultdict(int)
+        self.num_cached_block: Dict[str, int] = {}
        self.prefix_cache_stats = PrefixCacheStats()
    @property
@@ -224,9 +224,10 @@ class KVCacheManager:
        if not self.enable_caching:
            return new_blocks
-        # FIXME: `num_cached_blocks` is not correct when the prefix cache
+        # Use `new_computed_blocks` for a new request, and `num_cached_block`
-        # of a new request is hit.
+        # for a running request.
-        num_cached_blocks = self.num_cached_block[request.request_id]
+        num_cached_blocks = self.num_cached_block.get(request.request_id,
+                                                      len(new_computed_blocks))
        # Speculated tokens might be rejected in the future, so we do
        # not cache any speculated tokens. We only cache blocks with
        # generated (accepted) tokens.