Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6aa6ad89
Unverified
Commit
6aa6ad89
authored
Mar 04, 2026
by
Qi Wang
Committed by
GitHub
Mar 04, 2026
Browse files
[BugFix] Fix implicit and incorrect assumption on ECConnector is_producer (#34783)
Signed-off-by:
Qi Wang
<
qiwa@nvidia.com
>
parent
c8c3935b
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
125 additions
and
33 deletions
+125
-33
examples/online_serving/ec_both_encoder/ec_both_encoder.sh
examples/online_serving/ec_both_encoder/ec_both_encoder.sh
+73
-0
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+22
-9
tests/v1/ec_connector/unit/test_ec_example_connector.py
tests/v1/ec_connector/unit/test_ec_example_connector.py
+16
-16
vllm/distributed/ec_transfer/ec_connector/example_connector.py
...distributed/ec_transfer/ec_connector/example_connector.py
+3
-1
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+4
-0
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+4
-4
vllm/v1/executor/ray_executor.py
vllm/v1/executor/ray_executor.py
+1
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+2
-2
No files found.
examples/online_serving/ec_both_encoder/ec_both_encoder.sh
0 → 100755
View file @
6aa6ad89
#!/bin/bash
set
-euo
pipefail
MODEL
=
"
${
MODEL
:-
Qwen
/Qwen2.5-VL-3B-Instruct
}
"
PORT
=
"
${
PORT
:-
8000
}
"
GPU
=
"
${
GPU
:-
0
}
"
NUM_PROMPTS
=
"
${
NUM_PROMPTS
:-
200
}
"
EC_SHARED_STORAGE_PATH
=
"
${
EC_SHARED_STORAGE_PATH
:-
/tmp/ec_cache
}
"
TIMEOUT
=
"
${
TIMEOUT
:-
600
}
"
SERVER_PID
=
""
cleanup
()
{
echo
"Stopping server..."
if
[[
-n
"
$SERVER_PID
"
]]
&&
kill
-0
"
$SERVER_PID
"
2>/dev/null
;
then
kill
"
$SERVER_PID
"
2>/dev/null
||
true
wait
"
$SERVER_PID
"
2>/dev/null
||
true
fi
echo
"Done."
}
trap
cleanup EXIT INT TERM
wait_for_server
()
{
local
deadline
=
$((
SECONDS
+
TIMEOUT
))
echo
"Waiting for server on port
$PORT
..."
while
((
SECONDS < deadline
))
;
do
if
curl
-sf
"http://localhost:
${
PORT
}
/v1/models"
>
/dev/null 2>&1
;
then
echo
"Server ready."
return
0
fi
sleep
2
done
echo
"ERROR: Server did not start within
${
TIMEOUT
}
s"
return
1
}
rm
-rf
"
$EC_SHARED_STORAGE_PATH
"
mkdir
-p
"
$EC_SHARED_STORAGE_PATH
"
###############################################################################
# Start server with ec_both
###############################################################################
CUDA_VISIBLE_DEVICES
=
"
$GPU
"
\
vllm serve
"
$MODEL
"
\
--port
"
$PORT
"
\
--enforce-eager
\
--ec-transfer-config
'{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_both",
"ec_connector_extra_config": {
"shared_storage_path": "'
"
$EC_SHARED_STORAGE_PATH
"
'"
}
}'
\
"
$@
"
&
SERVER_PID
=
$!
wait_for_server
###############################################################################
# Benchmark -- dataset contains duplicate images, exercises cache hits
###############################################################################
echo
"Running benchmark (
$NUM_PROMPTS
prompts)..."
vllm bench serve
\
--model
"
$MODEL
"
\
--backend
openai-chat
\
--endpoint
/v1/chat/completions
\
--dataset-name
hf
\
--dataset-path
lmarena-ai/VisionArena-Chat
\
--seed
0
\
--num-prompts
"
$NUM_PROMPTS
"
\
--port
"
$PORT
"
echo
"Benchmark complete."
tests/v1/core/test_scheduler.py
View file @
6aa6ad89
...
@@ -3010,12 +3010,16 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
...
@@ -3010,12 +3010,16 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
# Encoder cache should contain all mm items from request2
# Encoder cache should contain all mm items from request2
_assert_right_encoder_cache_allocated
(
scheduler
,
requests
=
[
request2
])
_assert_right_encoder_cache_allocated
(
scheduler
,
requests
=
[
request2
])
# Should call update_state_after_alloc for hash1_C, ONLY
# hash1_A should not be loaded from connector
# hash1_A should not be loaded from connector
# since it's computed in last request & exist in local cache
# since it's computed in last request & exist in local cache
# Order of getting encoder cache should be: local cache -> connector-> compute
# Order of getting encoder cache should be: local cache -> connector-> compute
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_called_with
(
request2
,
0
)
# update_state_after_alloc is called for all paths:
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_called_once
()
# index 0 (hash1_C): connector hit → queued for load
# index 1 (hash1_D): cache miss → no-op inside connector
# index 2 (hash1_E): cache miss → no-op inside connector
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_any_call
(
request2
,
0
)
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_any_call
(
request2
,
1
)
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_any_call
(
request2
,
2
)
scheduler
.
ec_connector
.
update_state_after_alloc
.
reset_mock
()
scheduler
.
ec_connector
.
update_state_after_alloc
.
reset_mock
()
...
@@ -3087,7 +3091,6 @@ def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector):
...
@@ -3087,7 +3091,6 @@ def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector):
# mm_hashes of requests exist in cache after scheduling for all scenario
# mm_hashes of requests exist in cache after scheduling for all scenario
_assert_right_encoder_cache_allocated
(
scheduler
,
requests
=
requests
)
_assert_right_encoder_cache_allocated
(
scheduler
,
requests
=
requests
)
# Should only call update_state_after_alloc when loaded externally
if
cache_exist
==
"connector_only"
:
if
cache_exist
==
"connector_only"
:
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_called_with
(
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_called_with
(
requests
[
-
1
],
0
requests
[
-
1
],
0
...
@@ -3098,9 +3101,15 @@ def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector):
...
@@ -3098,9 +3101,15 @@ def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector):
# Check metadata should contain mm data for all 10 requests
# Check metadata should contain mm data for all 10 requests
_assert_right_ec_connector_metadata
(
output
,
mm_features_list
=
mm_features_list
)
_assert_right_ec_connector_metadata
(
output
,
mm_features_list
=
mm_features_list
)
else
:
elif
cache_exist
==
"local"
:
# Local cache hit: items never reach update_state_after_alloc
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_not_called
()
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_not_called
()
# ECConnector should carry no metadata
_assert_right_ec_connector_metadata
(
output
,
mm_features_list
=
[])
else
:
# no_where: called from encoder_inputs_to_schedule but no-op
# inside connector (has_cache_item returns False)
assert
cache_exist
==
"no_where"
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_called
()
_assert_right_ec_connector_metadata
(
output
,
mm_features_list
=
[])
_assert_right_ec_connector_metadata
(
output
,
mm_features_list
=
[])
scheduler
.
ec_connector
.
update_state_after_alloc
.
reset_mock
()
scheduler
.
ec_connector
.
update_state_after_alloc
.
reset_mock
()
...
@@ -3419,7 +3428,6 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
...
@@ -3419,7 +3428,6 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
# mm_hash of request_low exists in cache after scheduling for all scenario
# mm_hash of request_low exists in cache after scheduling for all scenario
_assert_right_encoder_cache_allocated
(
scheduler
,
requests
=
[
request_low
])
_assert_right_encoder_cache_allocated
(
scheduler
,
requests
=
[
request_low
])
# Should only call update_state_after_alloc when loaded externally
if
cache_exist
==
"connector_only"
:
if
cache_exist
==
"connector_only"
:
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_called_with
(
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_called_with
(
request_low
,
0
request_low
,
0
...
@@ -3427,9 +3435,14 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
...
@@ -3427,9 +3435,14 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
_assert_right_ec_connector_metadata
(
_assert_right_ec_connector_metadata
(
output
,
mm_features_list
=
request_low
.
mm_features
output
,
mm_features_list
=
request_low
.
mm_features
)
)
el
se
:
el
if
cache_exist
==
"local"
:
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_not_called
()
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_not_called
()
# ECConnector should carry no metadata
_assert_right_ec_connector_metadata
(
output
,
mm_features_list
=
[])
else
:
assert
cache_exist
==
"no_where"
scheduler
.
ec_connector
.
update_state_after_alloc
.
assert_called_with
(
request_low
,
0
)
_assert_right_ec_connector_metadata
(
output
,
mm_features_list
=
[])
_assert_right_ec_connector_metadata
(
output
,
mm_features_list
=
[])
scheduler
.
ec_connector
.
update_state_after_alloc
.
reset_mock
()
scheduler
.
ec_connector
.
update_state_after_alloc
.
reset_mock
()
...
...
tests/v1/ec_connector/unit/test_ec_example_connector.py
View file @
6aa6ad89
...
@@ -233,9 +233,10 @@ class TestStateManagement:
...
@@ -233,9 +233,10 @@ class TestStateManagement:
# Initial state should be empty
# Initial state should be empty
assert
len
(
connector
.
_mm_datas_need_loads
)
==
0
assert
len
(
connector
.
_mm_datas_need_loads
)
==
0
# Update state for all 3 items
# Update state for all 3 items (mock cache existence)
for
i
in
range
(
3
):
with
patch
.
object
(
connector
,
"has_cache_item"
,
return_value
=
True
):
connector
.
update_state_after_alloc
(
mock_request_with_3_mm
,
index
=
i
)
for
i
in
range
(
3
):
connector
.
update_state_after_alloc
(
mock_request_with_3_mm
,
index
=
i
)
# Check state updated for all 3
# Check state updated for all 3
assert
len
(
connector
.
_mm_datas_need_loads
)
==
3
assert
len
(
connector
.
_mm_datas_need_loads
)
==
3
...
@@ -255,9 +256,10 @@ class TestStateManagement:
...
@@ -255,9 +256,10 @@ class TestStateManagement:
role
=
ECConnectorRole
.
SCHEDULER
,
role
=
ECConnectorRole
.
SCHEDULER
,
)
)
# Setup state for all 3 items
# Setup state for all 3 items (mock cache existence)
for
i
in
range
(
3
):
with
patch
.
object
(
connector
,
"has_cache_item"
,
return_value
=
True
):
connector
.
update_state_after_alloc
(
mock_request_with_3_mm
,
index
=
i
)
for
i
in
range
(
3
):
connector
.
update_state_after_alloc
(
mock_request_with_3_mm
,
index
=
i
)
# Build metadata
# Build metadata
scheduler_output
=
Mock
(
spec
=
SchedulerOutput
)
scheduler_output
=
Mock
(
spec
=
SchedulerOutput
)
...
@@ -298,9 +300,10 @@ class TestStateManagement:
...
@@ -298,9 +300,10 @@ class TestStateManagement:
role
=
ECConnectorRole
.
SCHEDULER
,
role
=
ECConnectorRole
.
SCHEDULER
,
)
)
# Add state
# Add state (mock cache existence)
for
i
in
range
(
3
):
with
patch
.
object
(
connector
,
"has_cache_item"
,
return_value
=
True
):
connector
.
update_state_after_alloc
(
mock_request_with_3_mm
,
index
=
i
)
for
i
in
range
(
3
):
connector
.
update_state_after_alloc
(
mock_request_with_3_mm
,
index
=
i
)
assert
len
(
connector
.
_mm_datas_need_loads
)
==
3
assert
len
(
connector
.
_mm_datas_need_loads
)
==
3
# Build metadata (should clear state)
# Build metadata (should clear state)
...
@@ -608,16 +611,13 @@ class TestEdgeCases:
...
@@ -608,16 +611,13 @@ class TestEdgeCases:
with
pytest
.
raises
(
FileNotFoundError
):
with
pytest
.
raises
(
FileNotFoundError
):
connector
.
start_load_caches
(
encoder_cache
=
encoder_cache
)
connector
.
start_load_caches
(
encoder_cache
=
encoder_cache
)
def
test_has_cache
s
_empty_request
(
self
,
mock_vllm_config_producer
):
def
test_has_cache
_item
_empty_request
(
self
,
mock_vllm_config_producer
):
"""Test has_cache
s with request that has no MM data
."""
"""Test has_cache
_item with a nonexistent identifier
."""
connector
=
ECExampleConnector
(
connector
=
ECExampleConnector
(
vllm_config
=
mock_vllm_config_producer
,
vllm_config
=
mock_vllm_config_producer
,
role
=
ECConnectorRole
.
SCHEDULER
,
role
=
ECConnectorRole
.
SCHEDULER
,
)
)
mock_request
=
MockRequest
(
"req_empty"
,
[],
[]
)
result
=
connector
.
has_cache_item
(
"nonexistent_hash"
)
result
=
connector
.
has_caches
(
mock_request
)
assert
result
is
False
assert
len
(
result
)
==
0
assert
result
==
[]
vllm/distributed/ec_transfer/ec_connector/example_connector.py
View file @
6aa6ad89
...
@@ -141,8 +141,10 @@ class ECExampleConnector(ECConnectorBase):
...
@@ -141,8 +141,10 @@ class ECExampleConnector(ECConnectorBase):
Update ECConnector state after encoder cache allocation.
Update ECConnector state after encoder cache allocation.
"""
"""
mm_hash
=
request
.
mm_features
[
index
].
identifier
mm_hash
=
request
.
mm_features
[
index
].
identifier
# Only load cache if it is consumer and cache exists
if
not
self
.
is_consumer
or
not
self
.
has_cache_item
(
mm_hash
):
return
num_encoder_token
=
request
.
get_num_encoder_embeds
(
index
)
num_encoder_token
=
request
.
get_num_encoder_embeds
(
index
)
# Insert mm_hash only if this block has not been recorded yet.
self
.
_mm_datas_need_loads
[
mm_hash
]
=
num_encoder_token
self
.
_mm_datas_need_loads
[
mm_hash
]
=
num_encoder_token
def
build_connector_meta
(
def
build_connector_meta
(
...
...
vllm/v1/core/sched/scheduler.py
View file @
6aa6ad89
...
@@ -515,6 +515,8 @@ class Scheduler(SchedulerInterface):
...
@@ -515,6 +515,8 @@ class Scheduler(SchedulerInterface):
# Allocate the encoder cache.
# Allocate the encoder cache.
for
i
in
encoder_inputs_to_schedule
:
for
i
in
encoder_inputs_to_schedule
:
self
.
encoder_cache_manager
.
allocate
(
request
,
i
)
self
.
encoder_cache_manager
.
allocate
(
request
,
i
)
if
self
.
ec_connector
is
not
None
:
self
.
ec_connector
.
update_state_after_alloc
(
request
,
i
)
encoder_compute_budget
=
new_encoder_compute_budget
encoder_compute_budget
=
new_encoder_compute_budget
if
external_load_encoder_input
:
if
external_load_encoder_input
:
for
i
in
external_load_encoder_input
:
for
i
in
external_load_encoder_input
:
...
@@ -803,6 +805,8 @@ class Scheduler(SchedulerInterface):
...
@@ -803,6 +805,8 @@ class Scheduler(SchedulerInterface):
# Allocate the encoder cache.
# Allocate the encoder cache.
for
i
in
encoder_inputs_to_schedule
:
for
i
in
encoder_inputs_to_schedule
:
self
.
encoder_cache_manager
.
allocate
(
request
,
i
)
self
.
encoder_cache_manager
.
allocate
(
request
,
i
)
if
self
.
ec_connector
is
not
None
:
self
.
ec_connector
.
update_state_after_alloc
(
request
,
i
)
encoder_compute_budget
=
new_encoder_compute_budget
encoder_compute_budget
=
new_encoder_compute_budget
# Allocate for external load encoder cache
# Allocate for external load encoder cache
if
external_load_encoder_input
:
if
external_load_encoder_input
:
...
...
vllm/v1/engine/core.py
View file @
6aa6ad89
...
@@ -193,9 +193,9 @@ class EngineCore:
...
@@ -193,9 +193,9 @@ class EngineCore:
logger
.
debug
(
"Batch queue is enabled with size %d"
,
self
.
batch_queue_size
)
logger
.
debug
(
"Batch queue is enabled with size %d"
,
self
.
batch_queue_size
)
self
.
batch_queue
=
deque
(
maxlen
=
self
.
batch_queue_size
)
self
.
batch_queue
=
deque
(
maxlen
=
self
.
batch_queue_size
)
self
.
is_ec_
produc
er
=
(
self
.
is_ec_
consum
er
=
(
vllm_config
.
ec_transfer_config
is
not
None
vllm_config
.
ec_transfer_config
is
None
and
vllm_config
.
ec_transfer_config
.
is_ec_
produc
er
or
vllm_config
.
ec_transfer_config
.
is_ec_
consum
er
)
)
self
.
is_pooling_model
=
vllm_config
.
model_config
.
runner_type
==
"pooling"
self
.
is_pooling_model
=
vllm_config
.
model_config
.
runner_type
==
"pooling"
...
@@ -449,7 +449,7 @@ class EngineCore:
...
@@ -449,7 +449,7 @@ class EngineCore:
exec_future
=
self
.
model_executor
.
execute_model
(
exec_future
=
self
.
model_executor
.
execute_model
(
scheduler_output
,
non_block
=
True
scheduler_output
,
non_block
=
True
)
)
if
not
self
.
is_ec_
produc
er
:
if
self
.
is_ec_
consum
er
:
model_executed
=
scheduler_output
.
total_num_scheduled_tokens
>
0
model_executed
=
scheduler_output
.
total_num_scheduled_tokens
>
0
if
self
.
is_pooling_model
or
not
model_executed
:
if
self
.
is_pooling_model
or
not
model_executed
:
...
...
vllm/v1/executor/ray_executor.py
View file @
6aa6ad89
...
@@ -100,7 +100,7 @@ class RayDistributedExecutor(Executor):
...
@@ -100,7 +100,7 @@ class RayDistributedExecutor(Executor):
self
.
uses_sampler
=
self
.
vllm_config
.
model_config
.
runner_type
!=
"pooling"
and
(
self
.
uses_sampler
=
self
.
vllm_config
.
model_config
.
runner_type
!=
"pooling"
and
(
self
.
vllm_config
.
ec_transfer_config
is
None
self
.
vllm_config
.
ec_transfer_config
is
None
or
not
self
.
vllm_config
.
ec_transfer_config
.
is_ec_
produc
er
or
self
.
vllm_config
.
ec_transfer_config
.
is_ec_
consum
er
)
)
self
.
scheduler_output
:
SchedulerOutput
|
None
=
None
self
.
scheduler_output
:
SchedulerOutput
|
None
=
None
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
6aa6ad89
...
@@ -3409,7 +3409,7 @@ class GPUModelRunner(
...
@@ -3409,7 +3409,7 @@ class GPUModelRunner(
# Update persistent batch states.
# Update persistent batch states.
self
.
_update_states
(
scheduler_output
)
self
.
_update_states
(
scheduler_output
)
if
has_ec_transfer
()
and
get_ec_transfer
().
is_
produc
er
:
if
has_ec_transfer
()
and
not
get_ec_transfer
().
is_
consum
er
:
with
self
.
maybe_get_ec_connector_output
(
with
self
.
maybe_get_ec_connector_output
(
scheduler_output
,
scheduler_output
,
encoder_cache
=
self
.
encoder_cache
,
encoder_cache
=
self
.
encoder_cache
,
...
@@ -6182,7 +6182,7 @@ class GPUModelRunner(
...
@@ -6182,7 +6182,7 @@ class GPUModelRunner(
KVCacheSpec: A dictionary mapping layer names to their KV cache
KVCacheSpec: A dictionary mapping layer names to their KV cache
format. Layers that do not need KV cache are not included.
format. Layers that do not need KV cache are not included.
"""
"""
if
has_ec_transfer
()
and
get_ec_transfer
().
is_
produc
er
:
if
has_ec_transfer
()
and
not
get_ec_transfer
().
is_
consum
er
:
return
{}
return
{}
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
]
=
{}
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
]
=
{}
layer_type
=
cast
(
type
[
Any
],
AttentionLayerBase
)
layer_type
=
cast
(
type
[
Any
],
AttentionLayerBase
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment