fix(lmcache): correct store for cached requests while enable prefix cache (#39719)

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

fix(lmcache): correct store for cached requests while enable prefix cache (#39719)
Signed-off-by: baoloongmao <baoloongmao@tencent.com>
b2f749dc · maobaolong · GitHub · 70ed0155 · b2f749dc
Unverified Commit b2f749dc authored Apr 15, 2026 by maobaolong Committed by GitHub Apr 14, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 7 deletions

vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +18 -7

No files found.
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
@@ -309,14 +309,25 @@ class LMCacheMPRequestMetadata:
        # always be a multiple of `blocks_in_chunk`
        # TODO: This should be checked everytime we update the num_stored_blocks
        #
-        # Why computed_blocks includes num_lmcache_hit_blocks:
+        # Why computed_blocks uses max(num_vllm_hit_blocks, num_lmcache_hit_blocks):
        #
-        # Include lmcache-hit blocks so that the upper bound
+        # Both values represent a prefix of blocks whose KV data is already
-        # matches num_stored_blocks (which already covers
+        # available (either from vLLM APC or from LMCache), so they must NOT
-        # them). Hit blocks are NOT re-stored.
+        # be summed (that would double-count the overlapping prefix).
-        computed_blocks = (
+        #
-            tracker.num_scheduled_tokens // vllm_block_size
+        # * num_lmcache_hit_blocks: LMCache-hit blocks are already counted in
-            + tracker.num_lmcache_hit_blocks
+        #   num_stored_blocks (set during lookup), so they must be included
+        #   here to keep the upper bound consistent.  They are NOT re-stored.
+        # * num_vllm_hit_blocks: LMCache stores in units of chunks (N blocks),
+        #   so num_lmcache_hit_blocks is rounded DOWN to the nearest chunk
+        #   boundary.  When vLLM APC hits more blocks than that rounded value
+        #   (e.g. APC=44 blocks, LMCache=32 blocks after chunk alignment),
+        #   using only num_lmcache_hit_blocks would set the upper bound too
+        #   low and silently skip the APC-hit blocks that fall between the
+        #   two values, causing under-storing.  Taking the max ensures we
+        #   always count the longer of the two already-available prefixes.
+        computed_blocks = tracker.num_scheduled_tokens // vllm_block_size + max(
+            tracker.num_vllm_hit_blocks, tracker.num_lmcache_hit_blocks
        )
        min_available_blocks = min(
            len(tracker.block_hashes),