Unverified commit 7893f268, authored by Alec, committed by GitHub
Browse files

feat: add --disaggregation-mode enum to vLLM backend (#6483)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 6d3e0137
......@@ -42,7 +42,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disaggregation-mode prefill --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
......
......@@ -54,7 +54,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disaggregation-mode prefill --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
......
......@@ -43,6 +43,8 @@ spec:
args:
- --model
- Qwen/Qwen3-32B
- --disaggregation-mode
- decode
- --tensor-parallel-size
- '2'
- --disable-log-requests
......@@ -95,7 +97,8 @@ spec:
args:
- --model
- Qwen/Qwen3-32B
- --is-prefill-worker
- --disaggregation-mode
- prefill
- --tensor-parallel-size
- '2'
- --disable-log-requests
......
......@@ -70,7 +70,7 @@ class DynamoWorkerProcess(ManagedProcess):
# Configure health check based on worker type
if is_prefill:
# Prefill workers check their own status endpoint
command.append("--is-prefill-worker")
command.extend(["--disaggregation-mode", "prefill"])
health_check_urls = [
(f"http://localhost:{system_port}/health", self.is_ready)
]
......
......@@ -126,7 +126,8 @@ spec:
- --model
- deepseek-ai/DeepSeek-V2-Lite
- --trust-remote-code
- --is-prefill-worker
- --disaggregation-mode
- prefill
- --tensor-parallel-size
- "1"
- --data-parallel-size
......
......@@ -51,7 +51,7 @@ class DynamoWorkerProcess(ManagedProcess):
# Configure health check based on worker type
if is_prefill:
# Prefill workers check their own status endpoint
command.append("--is-prefill-worker")
command.extend(["--disaggregation-mode", "prefill"])
health_check_urls = [(f"http://localhost:{port}/health", self.is_ready)]
else:
# Decode workers should also check their own status endpoint first,
......
......@@ -104,9 +104,9 @@ class DynamoWorkerProcess(ManagedProcess):
"0.15", # avoid assertion error on vLLM available memory checks
]
if is_prefill is True:
command.append("--is-prefill-worker")
command.extend(["--disaggregation-mode", "prefill"])
elif is_prefill is False:
command.append("--is-decode-worker")
command.extend(["--disaggregation-mode", "decode"])
# Aggregated mode and prefill workers publish KV events
if is_prefill is not False:
......
......@@ -154,7 +154,8 @@ class LLMServerManager:
"dynamo.vllm",
"--model",
os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
"--is-prefill-worker",
"--disaggregation-mode",
"prefill",
"--block-size",
"16",
"--max-model-len",
......
......@@ -28,8 +28,8 @@ echo "🔧 Starting dynamo disaggregated serving without LMCache..."
python -m dynamo.frontend &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL&
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL --disaggregation-mode decode &
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL_URL \
--is-prefill-worker
--disaggregation-mode prefill
......@@ -29,7 +29,7 @@ echo "🔧 Starting dynamo disaggregated serving with LMCache enabled..."
python -m dynamo.frontend &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL&
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL --disaggregation-mode decode &
sleep 20
......@@ -40,4 +40,4 @@ LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \
--model $MODEL_URL \
--is-prefill-worker
--disaggregation-mode prefill
......@@ -90,7 +90,7 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 --disaggregation-mode decode 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker:
envFromSecret: hf-token-secret
componentType: worker
......@@ -137,4 +137,4 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --disaggregation-mode prefill --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log"
......@@ -137,4 +137,4 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --disaggregation-mode prefill --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log"
......@@ -196,7 +196,8 @@ spec:
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
- --is-prefill-worker
- --disaggregation-mode
- prefill
- --no-enable-prefix-caching
- --block-size
- "128"
......@@ -90,7 +90,7 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 --disaggregation-mode decode 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker:
envFromSecret: hf-token-secret
componentType: worker
......@@ -137,4 +137,4 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --disaggregation-mode prefill --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 2>&1 | tee /tmp/vllm.log"
......@@ -137,4 +137,4 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --disaggregation-mode prefill 2>&1 | tee /tmp/vllm.log"
......@@ -77,4 +77,5 @@ spec:
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
- --is-prefill-worker
- --disaggregation-mode
- prefill
......@@ -49,6 +49,8 @@ spec:
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
- --disaggregation-mode
- decode
VllmPrefillWorker:
envFromSecret: hf-token-secret
componentType: worker
......@@ -68,4 +70,5 @@ spec:
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
- --is-prefill-worker
- --disaggregation-mode
- prefill
......@@ -125,9 +125,9 @@ def _build_mocker_command(
# Add worker type flag for disaggregated mode
if worker_type == "prefill":
command.append("--is-prefill-worker")
command.extend(["--disaggregation-mode", "prefill"])
elif worker_type == "decode":
command.append("--is-decode-worker")
command.extend(["--disaggregation-mode", "decode"])
# Add individual CLI arguments from mocker_args
if "speedup_ratio" in mocker_args:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment