Unverified Commit 7b967c77 authored by Karen Chung's avatar Karen Chung Committed by GitHub
Browse files

fix: Mark enable_kv_events=True for user-specified vLLM KVEventsConfig in scripts (#4418)


Signed-off-by: default avatarKaren Chung <karenc@nvidia.com>
parent 2f1778c1
...@@ -357,6 +357,12 @@ def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]: ...@@ -357,6 +357,12 @@ def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
# If user provided their own config, use that # If user provided their own config, use that
if c := getattr(config.engine_args, "kv_events_config"): if c := getattr(config.engine_args, "kv_events_config"):
# Warn user that enable_kv_cache_events probably should be True (user may have omitted it from JSON)
if not c.enable_kv_cache_events:
logger.warning(
"User provided --kv_events_config which set enable_kv_cache_events to False (default). "
"To publish events, explicitly set enable_kv_cache_events to True."
)
logger.info(f"Using user-provided kv_events_config {c}") logger.info(f"Using user-provided kv_events_config {c}")
return c return c
......
...@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \ ...@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
--enforce-eager \ --enforce-eager \
--connector kvbm \ --connector kvbm \
--gpu-memory-utilization 0.4 \ --gpu-memory-utilization 0.4 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \ DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
...@@ -38,4 +38,4 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \ ...@@ -38,4 +38,4 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
--enforce-eager \ --enforce-eager \
--connector kvbm \ --connector kvbm \
--gpu-memory-utilization 0.4 \ --gpu-memory-utilization 0.4 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
...@@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--connector none \ --connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
...@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ ...@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--connector none \ --connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
...@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}'& --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# two prefill workers # two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected # When registered with --is-prefill-worker, these workers are automatically detected
...@@ -42,7 +42,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \ ...@@ -42,7 +42,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --is-prefill-worker \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'& --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
...@@ -50,4 +50,4 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \ ...@@ -50,4 +50,4 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --is-prefill-worker \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment