Unverified commit 6d3b92f0, authored by Alec, committed by GitHub
Browse files

feat: remove --connector flag for vLLM backend (LLM-90) (#6450)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent c9ff6235
......@@ -11,4 +11,4 @@ python -m dynamo.frontend &
# run worker with KVBM enabled
# NOTE: remove --enforce-eager for production use
DYN_KVBM_CPU_CACHE_GB=20 \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector kvbm --enforce-eager
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' --enforce-eager
......@@ -25,7 +25,7 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
python3 -m dynamo.vllm \
--model $MODEL \
--enforce-eager \
--connector kvbm \
--kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' \
--gpu-memory-utilization 0.4 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
......@@ -36,6 +36,6 @@ CUDA_VISIBLE_DEVICES=1 DYN_KVBM_CPU_CACHE_GB=2 \
python3 -m dynamo.vllm \
--model $MODEL \
--enforce-eager \
--connector kvbm \
--kv-transfer-config '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}' \
--gpu-memory-utilization 0.4 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
......@@ -13,4 +13,4 @@ python -m dynamo.frontend &
# run worker with LMCache enabled (without PROMETHEUS_MULTIPROC_DIR set externally)
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
......@@ -24,5 +24,5 @@ python -m dynamo.frontend &
# run worker with LMCache enabled and PROMETHEUS_MULTIPROC_DIR explicitly set
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
......@@ -65,11 +65,10 @@ fi
# Start vLLM worker with vision model
# Multimodal data (images) are decoded in the backend worker using ImageLoader
# --enforce-eager: Quick deployment (remove for production)
# --connector none: No KV transfer needed for aggregated serving
# Extra args from command line come last to allow overrides
CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --connector none $MODEL_SPECIFIC_ARGS "${EXTRA_ARGS[@]}"
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME $MODEL_SPECIFIC_ARGS "${EXTRA_ARGS[@]}"
# Wait for all background processes to complete
wait
......
......@@ -50,5 +50,4 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
--model "$MODEL" \
--omni \
--stage-configs-path "$STAGE_CONFIG" \
--connector none \
"${EXTRA_ARGS[@]}"
......@@ -38,7 +38,6 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm \
--model "$MODEL" \
--omni \
--connector none \
--output-modalities image \
--media-output-fs-url file:///tmp/dynamo_media \
"${EXTRA_ARGS[@]}"
......@@ -40,7 +40,6 @@ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm \
--model "$MODEL" \
--omni \
--connector none \
--output-modalities video \
--media-output-fs-url file:///tmp/dynamo_media \
"${EXTRA_ARGS[@]}"
\ No newline at end of file
......@@ -46,4 +46,4 @@ python -m dynamo.frontend &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
DYN_HEALTH_CHECK_ENABLED=true \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager
......@@ -28,7 +28,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
......@@ -37,5 +36,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
......@@ -30,7 +30,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
......@@ -39,5 +38,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
\ No newline at end of file
......@@ -25,5 +25,4 @@ CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm \
"num_speculative_tokens": 2,
"method": "eagle"
}' \
--connector none \
--gpu-memory-utilization 0.8
\ No newline at end of file
......@@ -10,7 +10,7 @@ python -m dynamo.frontend &
# run decode worker on GPU 0, without enabling KVBM
# NOTE: remove --enforce-eager for production use
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager &
# run prefill worker on GPU 1 with KVBM enabled using 20GB of CPU cache
# NOTE: remove --enforce-eager for production use
......@@ -20,6 +20,6 @@ CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--disaggregation-mode prefill \
--connector kvbm nixl \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
......@@ -10,9 +10,9 @@ python -m dynamo.frontend --router-mode kv &
# run decode workers on GPU 0 and 1, without enabling KVBM
# NOTE: remove --enforce-eager for production use
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --disaggregation-mode decode &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --disaggregation-mode decode &
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --enforce-eager --disaggregation-mode decode &
# run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
# NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts
......@@ -23,7 +23,7 @@ CUDA_VISIBLE_DEVICES=2 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--disaggregation-mode prefill \
--connector kvbm nixl \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}' &
......@@ -35,6 +35,6 @@ CUDA_VISIBLE_DEVICES=3 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--disaggregation-mode prefill \
--connector kvbm nixl \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
......@@ -38,7 +38,7 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
--model $MODEL \
--enforce-eager \
--disaggregation-mode prefill \
--connector kvbm nixl \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
......@@ -49,5 +49,5 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
--model $MODEL \
--enforce-eager \
--disaggregation-mode prefill \
--connector kvbm nixl \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
......@@ -20,5 +20,5 @@ CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--disaggregation-mode prefill \
--connector lmcache nixl \
--kv-transfer-config '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
......@@ -32,7 +32,6 @@ HABANA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
......@@ -40,7 +39,6 @@ HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--disaggregation-mode decode &
# two prefill workers
......@@ -51,7 +49,6 @@ HABANA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558", "enable_kv_cache_events":true}'&
......@@ -60,6 +57,5 @@ HABANA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559", "enable_kv_cache_events":true}'
......@@ -24,7 +24,6 @@ python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager \
--connector none \
--enable-lora \
--max-lora-rank 64
......
......@@ -36,7 +36,6 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--enable-lora \
--max-lora-rank 64 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
......@@ -47,7 +46,6 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--enable-lora \
--max-lora-rank 64 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
......
......@@ -83,8 +83,6 @@ spec:
- --enable-multimodal
- --model
- Qwen/Qwen3-VL-2B-Instruct
- --connector
- none
- --enable-lora
- --max-lora-rank
- "64"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment