Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5d6aae45
Unverified
Commit
5d6aae45
authored
Mar 07, 2026
by
Samuel Shen
Committed by
GitHub
Mar 07, 2026
Browse files
[LMCache MP Patch]: Race Condition + Duplicated Block Ids (#35831)
parent
63298ee1
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
4 deletions
+23
-4
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
...buted/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
+23
-4
No files found.
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
View file @
5d6aae45
...
@@ -336,11 +336,21 @@ class LMCacheMPRequestMetadata:
...
@@ -336,11 +336,21 @@ class LMCacheMPRequestMetadata:
start_token_idx
=
start
*
vllm_block_size
start_token_idx
=
start
*
vllm_block_size
end_token_idx
=
end
*
vllm_block_size
end_token_idx
=
end
*
vllm_block_size
token_ids
=
list
(
tracker
.
all_token_ids
)
token_ids
=
list
(
tracker
.
all_token_ids
)
# Compute how many tokens at the start of the retrieve range
# overlap with APC-shared blocks. The server must skip writing
# to these positions to avoid a cross-stream data race: the
# retrieve writes on the LMCache CUDA stream while concurrent
# requests may read these APC-shared blocks on the vLLM stream.
apc_overlap_blocks
=
tracker
.
num_vllm_hit_blocks
-
start
skip_first_n_tokens
=
apc_overlap_blocks
*
vllm_block_size
op
=
LoadStoreOp
(
op
=
LoadStoreOp
(
token_ids
=
token_ids
,
token_ids
=
token_ids
,
block_ids
=
block_ids
,
block_ids
=
block_ids
,
start
=
start_token_idx
,
start
=
start_token_idx
,
end
=
end_token_idx
,
end
=
end_token_idx
,
skip_first_n_tokens
=
skip_first_n_tokens
,
)
)
ret
=
LMCacheMPRequestMetadata
(
ret
=
LMCacheMPRequestMetadata
(
...
@@ -700,13 +710,22 @@ class LMCacheMPConnector(KVConnectorBase_V1):
...
@@ -700,13 +710,22 @@ class LMCacheMPConnector(KVConnectorBase_V1):
num_external_tokens (int): the number of tokens that will be
num_external_tokens (int): the number of tokens that will be
loaded from the external KV cache.
loaded from the external KV cache.
"""
"""
# NOTE: the `blocks` are NEW BLOCKS allocated for this request.
# NOTE: `blocks` comes from kv_cache_manager.get_blocks(request_id),
# which returns ALL blocks for the request (not just newly allocated).
# This function may be called twice for async-load requests:
# 1st call: blocks = initial allocation (APC + fresh)
# 2nd call: blocks = all blocks
# (initial + newly allocated for remaining tokens)
# We must only append the NEW blocks beyond what's already tracked
# to avoid duplication, which would corrupt the store path's block indexing.
tracker
=
self
.
_get_request_tracker
(
request
.
request_id
)
tracker
=
self
.
_get_request_tracker
(
request
.
request_id
)
block_ids
=
reformat_block_ids
(
blocks
.
get_block_ids
())
block_ids
=
reformat_block_ids
(
blocks
.
get_block_ids
())
# No matter we need to retrieve or not, we need to update
# Only append blocks beyond what's already tracked
# the block ids into the tracker
existing_count
=
len
(
tracker
.
allocated_block_ids
)
tracker
.
append_block_ids
(
block_ids
)
new_block_ids
=
block_ids
[
existing_count
:]
if
new_block_ids
:
tracker
.
append_block_ids
(
new_block_ids
)
# Update the state of the tracker
# Update the state of the tracker
condition
=
tracker
.
needs_retrieve
()
condition
=
tracker
.
needs_retrieve
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment