Unverified commit 6d3b92f0, authored by Alec and committed by GitHub
Browse files

feat: remove --connector flag for vLLM backend (LLM-90) (#6450)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent c9ff6235
......@@ -140,13 +140,11 @@ echo "Starting vLLM worker..."
# --enable-lora: Enable LoRA adapter support in vLLM engine
# --max-lora-rank: Maximum LoRA rank (increase if your adapters have higher rank)
# --connector none: No KV transfer needed for aggregated serving
CUDA_VISIBLE_DEVICES="$GPU_DEVICE" \
DYN_SYSTEM_PORT="$SYSTEM_PORT" \
python -m dynamo.vllm \
--enable-multimodal \
--model "$MODEL_NAME" \
--connector none \
--enable-lora \
--max-lora-rank "$MAX_LORA_RANK" \
"${MODEL_SPECIFIC_ARGS[@]}" \
......
......@@ -73,8 +73,6 @@ class VllmPromptEmbedsWorkerProcess(ManagedProcess):
"dynamo.vllm",
"--model",
TEST_MODEL,
"--connector",
"none",
"--max-model-len",
"4096",
"--discovery-backend",
......
......@@ -93,8 +93,6 @@ class VllmWorkerProcess(ManagedProcess):
"harmony",
"--dyn-reasoning-parser",
"gpt_oss",
"--connector",
"none",
]
env = os.environ.copy()
......
......@@ -354,8 +354,8 @@ def llm_worker(frontend_server, test_directory, runtime_services, engine_type):
"dynamo.vllm",
"--model",
model_id,
"--connector",
"kvbm",
"--kv-transfer-config",
'{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}',
"--enforce-eager", # For faster startup in tests
]
else: # trtllm
......@@ -777,8 +777,8 @@ class TestConsolidatorRouterE2E:
"dynamo.vllm",
"--model",
model_id,
"--connector",
"kvbm",
"--kv-transfer-config",
'{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}',
"--enforce-eager",
"--enable-prefix-caching",
"--num-gpu-blocks-override",
......
......@@ -143,8 +143,8 @@ class LLMServerManager:
"16",
"--max-model-len",
"8000", # required to fit on L4 GPU when using 8b model
"--connector",
"nixl",
"--kv-transfer-config",
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}',
]
# Construct prefiller command
......@@ -160,9 +160,8 @@ class LLMServerManager:
"16",
"--max-model-len",
"8000", # required to fit on L4 GPU when using 8b model
"--connector",
"kvbm",
"nixl",
"--kv-transfer-config",
'{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}',
]
# GPU blocks override
......
......@@ -27,4 +27,4 @@ echo "🔧 Starting dynamo worker with LMCache enabled..."
python -m dynamo.frontend &
python3 -m dynamo.vllm --model $MODEL_URL --connector lmcache
python3 -m dynamo.vllm --model $MODEL_URL --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
......@@ -109,8 +109,6 @@ class VLLMWorkerProcess(ManagedProcess):
"0.85",
"--max-model-len",
"8192",
"--connector",
"none",
"--served-model-name",
f"{VLLM_MM_MODEL}__internal",
],
......
......@@ -148,7 +148,7 @@ class VLLMProcess:
# - When data_parallel_size is set, launch one process per DP rank
# - Each process gets --data-parallel-rank and --data-parallel-size
# - Each process runs on its own GPU via CUDA_VISIBLE_DEVICES
# - --connector nixl enables KV cache transfer between ranks
# - --kv-transfer-config enables KV cache transfer between ranks
for worker_idx in range(num_workers):
# Calculate GPU device for this process
......@@ -207,7 +207,7 @@ class VLLMProcess:
str(data_parallel_size),
# "--data-parallel-address", "127.0.0.1", # Required for DP coordination
# "--data-parallel-rpc-port", "13345", # RPC port for DP coordination
# "--connector", "nixl", # Required for KV transfer between DP ranks
# "--kv-transfer-config", '{"kv_connector":"NixlConnector","kv_role":"kv_both"}', # Required for KV transfer between DP ranks
]
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment