Unverified commit 6d3b92f0, authored by Alec, committed by GitHub
Browse files

feat: remove --connector flag for vLLM backend (LLM-90) (#6450)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent c9ff6235
...@@ -140,13 +140,11 @@ echo "Starting vLLM worker..." ...@@ -140,13 +140,11 @@ echo "Starting vLLM worker..."
# --enable-lora: Enable LoRA adapter support in vLLM engine # --enable-lora: Enable LoRA adapter support in vLLM engine
# --max-lora-rank: Maximum LoRA rank (increase if your adapters have higher rank) # --max-lora-rank: Maximum LoRA rank (increase if your adapters have higher rank)
# --connector none: No KV transfer needed for aggregated serving
CUDA_VISIBLE_DEVICES="$GPU_DEVICE" \ CUDA_VISIBLE_DEVICES="$GPU_DEVICE" \
DYN_SYSTEM_PORT="$SYSTEM_PORT" \ DYN_SYSTEM_PORT="$SYSTEM_PORT" \
python -m dynamo.vllm \ python -m dynamo.vllm \
--enable-multimodal \ --enable-multimodal \
--model "$MODEL_NAME" \ --model "$MODEL_NAME" \
--connector none \
--enable-lora \ --enable-lora \
--max-lora-rank "$MAX_LORA_RANK" \ --max-lora-rank "$MAX_LORA_RANK" \
"${MODEL_SPECIFIC_ARGS[@]}" \ "${MODEL_SPECIFIC_ARGS[@]}" \
......
...@@ -73,8 +73,6 @@ class VllmPromptEmbedsWorkerProcess(ManagedProcess): ...@@ -73,8 +73,6 @@ class VllmPromptEmbedsWorkerProcess(ManagedProcess):
"dynamo.vllm", "dynamo.vllm",
"--model", "--model",
TEST_MODEL, TEST_MODEL,
"--connector",
"none",
"--max-model-len", "--max-model-len",
"4096", "4096",
"--discovery-backend", "--discovery-backend",
......
...@@ -93,8 +93,6 @@ class VllmWorkerProcess(ManagedProcess): ...@@ -93,8 +93,6 @@ class VllmWorkerProcess(ManagedProcess):
"harmony", "harmony",
"--dyn-reasoning-parser", "--dyn-reasoning-parser",
"gpt_oss", "gpt_oss",
"--connector",
"none",
] ]
env = os.environ.copy() env = os.environ.copy()
......
...@@ -354,8 +354,8 @@ def llm_worker(frontend_server, test_directory, runtime_services, engine_type): ...@@ -354,8 +354,8 @@ def llm_worker(frontend_server, test_directory, runtime_services, engine_type):
"dynamo.vllm", "dynamo.vllm",
"--model", "--model",
model_id, model_id,
"--connector", "--kv-transfer-config",
"kvbm", '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}',
"--enforce-eager", # For faster startup in tests "--enforce-eager", # For faster startup in tests
] ]
else: # trtllm else: # trtllm
...@@ -777,8 +777,8 @@ class TestConsolidatorRouterE2E: ...@@ -777,8 +777,8 @@ class TestConsolidatorRouterE2E:
"dynamo.vllm", "dynamo.vllm",
"--model", "--model",
model_id, model_id,
"--connector", "--kv-transfer-config",
"kvbm", '{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"}',
"--enforce-eager", "--enforce-eager",
"--enable-prefix-caching", "--enable-prefix-caching",
"--num-gpu-blocks-override", "--num-gpu-blocks-override",
......
...@@ -143,8 +143,8 @@ class LLMServerManager: ...@@ -143,8 +143,8 @@ class LLMServerManager:
"16", "16",
"--max-model-len", "--max-model-len",
"8000", # required to fit on L4 GPU when using 8b model "8000", # required to fit on L4 GPU when using 8b model
"--connector", "--kv-transfer-config",
"nixl", '{"kv_connector":"NixlConnector","kv_role":"kv_both"}',
] ]
# Construct prefiller command # Construct prefiller command
...@@ -160,9 +160,8 @@ class LLMServerManager: ...@@ -160,9 +160,8 @@ class LLMServerManager:
"16", "16",
"--max-model-len", "--max-model-len",
"8000", # required to fit on L4 GPU when using 8b model "8000", # required to fit on L4 GPU when using 8b model
"--connector", "--kv-transfer-config",
"kvbm", '{"kv_connector":"PdConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"DynamoConnector","kv_connector_module_path":"kvbm.vllm_integration.connector","kv_role":"kv_both"},{"kv_connector":"NixlConnector","kv_role":"kv_both"}]},"kv_connector_module_path":"kvbm.vllm_integration.connector"}',
"nixl",
] ]
# GPU blocks override # GPU blocks override
......
...@@ -27,4 +27,4 @@ echo "🔧 Starting dynamo worker with LMCache enabled..." ...@@ -27,4 +27,4 @@ echo "🔧 Starting dynamo worker with LMCache enabled..."
python -m dynamo.frontend & python -m dynamo.frontend &
python3 -m dynamo.vllm --model $MODEL_URL --connector lmcache python3 -m dynamo.vllm --model $MODEL_URL --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
...@@ -109,8 +109,6 @@ class VLLMWorkerProcess(ManagedProcess): ...@@ -109,8 +109,6 @@ class VLLMWorkerProcess(ManagedProcess):
"0.85", "0.85",
"--max-model-len", "--max-model-len",
"8192", "8192",
"--connector",
"none",
"--served-model-name", "--served-model-name",
f"{VLLM_MM_MODEL}__internal", f"{VLLM_MM_MODEL}__internal",
], ],
......
...@@ -148,7 +148,7 @@ class VLLMProcess: ...@@ -148,7 +148,7 @@ class VLLMProcess:
# - When data_parallel_size is set, launch one process per DP rank # - When data_parallel_size is set, launch one process per DP rank
# - Each process gets --data-parallel-rank and --data-parallel-size # - Each process gets --data-parallel-rank and --data-parallel-size
# - Each process runs on its own GPU via CUDA_VISIBLE_DEVICES # - Each process runs on its own GPU via CUDA_VISIBLE_DEVICES
# - --connector nixl enables KV cache transfer between ranks # - --kv-transfer-config enables KV cache transfer between ranks
for worker_idx in range(num_workers): for worker_idx in range(num_workers):
# Calculate GPU device for this process # Calculate GPU device for this process
...@@ -207,7 +207,7 @@ class VLLMProcess: ...@@ -207,7 +207,7 @@ class VLLMProcess:
str(data_parallel_size), str(data_parallel_size),
# "--data-parallel-address", "127.0.0.1", # Required for DP coordination # "--data-parallel-address", "127.0.0.1", # Required for DP coordination
# "--data-parallel-rpc-port", "13345", # RPC port for DP coordination # "--data-parallel-rpc-port", "13345", # RPC port for DP coordination
# "--connector", "nixl", # Required for KV transfer between DP ranks # "--kv-transfer-config", '{"kv_connector":"NixlConnector","kv_role":"kv_both"}', # Required for KV transfer between DP ranks
] ]
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment