Commit 7893f268 (unverified), authored by Alec, committed by GitHub
Browse files

feat: add --disaggregation-mode enum to vLLM backend (#6483)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 6d3e0137
...@@ -42,7 +42,7 @@ spec: ...@@ -42,7 +42,7 @@ spec:
- name: HF_HOME - name: HF_HOME
value: /opt/models value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disaggregation-mode prefill --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
......
...@@ -54,7 +54,7 @@ spec: ...@@ -54,7 +54,7 @@ spec:
- name: HF_HOME - name: HF_HOME
value: /opt/models value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disaggregation-mode prefill --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
......
...@@ -43,6 +43,8 @@ spec: ...@@ -43,6 +43,8 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-32B - Qwen/Qwen3-32B
- --disaggregation-mode
- decode
- --tensor-parallel-size - --tensor-parallel-size
- '2' - '2'
- --disable-log-requests - --disable-log-requests
...@@ -95,7 +97,8 @@ spec: ...@@ -95,7 +97,8 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-32B - Qwen/Qwen3-32B
- --is-prefill-worker - --disaggregation-mode
- prefill
- --tensor-parallel-size - --tensor-parallel-size
- '2' - '2'
- --disable-log-requests - --disable-log-requests
......
...@@ -70,7 +70,7 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -70,7 +70,7 @@ class DynamoWorkerProcess(ManagedProcess):
# Configure health check based on worker type # Configure health check based on worker type
if is_prefill: if is_prefill:
# Prefill workers check their own status endpoint # Prefill workers check their own status endpoint
command.append("--is-prefill-worker") command.extend(["--disaggregation-mode", "prefill"])
health_check_urls = [ health_check_urls = [
(f"http://localhost:{system_port}/health", self.is_ready) (f"http://localhost:{system_port}/health", self.is_ready)
] ]
......
...@@ -126,7 +126,8 @@ spec: ...@@ -126,7 +126,8 @@ spec:
- --model - --model
- deepseek-ai/DeepSeek-V2-Lite - deepseek-ai/DeepSeek-V2-Lite
- --trust-remote-code - --trust-remote-code
- --is-prefill-worker - --disaggregation-mode
- prefill
- --tensor-parallel-size - --tensor-parallel-size
- "1" - "1"
- --data-parallel-size - --data-parallel-size
......
...@@ -51,7 +51,7 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -51,7 +51,7 @@ class DynamoWorkerProcess(ManagedProcess):
# Configure health check based on worker type # Configure health check based on worker type
if is_prefill: if is_prefill:
# Prefill workers check their own status endpoint # Prefill workers check their own status endpoint
command.append("--is-prefill-worker") command.extend(["--disaggregation-mode", "prefill"])
health_check_urls = [(f"http://localhost:{port}/health", self.is_ready)] health_check_urls = [(f"http://localhost:{port}/health", self.is_ready)]
else: else:
# Decode workers should also check their own status endpoint first, # Decode workers should also check their own status endpoint first,
......
...@@ -104,9 +104,9 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -104,9 +104,9 @@ class DynamoWorkerProcess(ManagedProcess):
"0.15", # avoid assertion error on vLLM available memory checks "0.15", # avoid assertion error on vLLM available memory checks
] ]
if is_prefill is True: if is_prefill is True:
command.append("--is-prefill-worker") command.extend(["--disaggregation-mode", "prefill"])
elif is_prefill is False: elif is_prefill is False:
command.append("--is-decode-worker") command.extend(["--disaggregation-mode", "decode"])
# Aggregated mode and prefill workers publish KV events # Aggregated mode and prefill workers publish KV events
if is_prefill is not False: if is_prefill is not False:
......
...@@ -154,7 +154,8 @@ class LLMServerManager: ...@@ -154,7 +154,8 @@ class LLMServerManager:
"dynamo.vllm", "dynamo.vllm",
"--model", "--model",
os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"), os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
"--is-prefill-worker", "--disaggregation-mode",
"prefill",
"--block-size", "--block-size",
"16", "16",
"--max-model-len", "--max-model-len",
......
...@@ -28,8 +28,8 @@ echo "🔧 Starting dynamo disaggregated serving without LMCache..." ...@@ -28,8 +28,8 @@ echo "🔧 Starting dynamo disaggregated serving without LMCache..."
python -m dynamo.frontend & python -m dynamo.frontend &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL& CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL --disaggregation-mode decode &
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL_URL \ --model $MODEL_URL \
--is-prefill-worker --disaggregation-mode prefill
...@@ -29,7 +29,7 @@ echo "🔧 Starting dynamo disaggregated serving with LMCache enabled..." ...@@ -29,7 +29,7 @@ echo "🔧 Starting dynamo disaggregated serving with LMCache enabled..."
python -m dynamo.frontend & python -m dynamo.frontend &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL& CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model $MODEL_URL --disaggregation-mode decode &
sleep 20 sleep 20
...@@ -40,4 +40,4 @@ LMCACHE_MAX_LOCAL_CPU_SIZE=20 \ ...@@ -40,4 +40,4 @@ LMCACHE_MAX_LOCAL_CPU_SIZE=20 \
CUDA_VISIBLE_DEVICES=1 \ CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model $MODEL_URL \ --model $MODEL_URL \
--is-prefill-worker --disaggregation-mode prefill
...@@ -90,7 +90,7 @@ spec: ...@@ -90,7 +90,7 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 --disaggregation-mode decode 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker: VllmPrefillWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
...@@ -137,4 +137,4 @@ spec: ...@@ -137,4 +137,4 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --disaggregation-mode prefill --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log"
...@@ -137,4 +137,4 @@ spec: ...@@ -137,4 +137,4 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --disaggregation-mode prefill --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log"
...@@ -196,7 +196,8 @@ spec: ...@@ -196,7 +196,8 @@ spec:
- dynamo.vllm - dynamo.vllm
- --model - --model
- nvidia/Llama-3.1-8B-Instruct-FP8 - nvidia/Llama-3.1-8B-Instruct-FP8
- --is-prefill-worker - --disaggregation-mode
- prefill
- --no-enable-prefix-caching - --no-enable-prefix-caching
- --block-size - --block-size
- "128" - "128"
...@@ -90,7 +90,7 @@ spec: ...@@ -90,7 +90,7 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 --disaggregation-mode decode 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker: VllmPrefillWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
...@@ -137,4 +137,4 @@ spec: ...@@ -137,4 +137,4 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --disaggregation-mode prefill --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 2>&1 | tee /tmp/vllm.log"
...@@ -137,4 +137,4 @@ spec: ...@@ -137,4 +137,4 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --disaggregation-mode prefill 2>&1 | tee /tmp/vllm.log"
...@@ -77,4 +77,5 @@ spec: ...@@ -77,4 +77,5 @@ spec:
- dynamo.vllm - dynamo.vllm
- --model - --model
- nvidia/Llama-3.1-8B-Instruct-FP8 - nvidia/Llama-3.1-8B-Instruct-FP8
- --is-prefill-worker - --disaggregation-mode
- prefill
...@@ -49,6 +49,8 @@ spec: ...@@ -49,6 +49,8 @@ spec:
- dynamo.vllm - dynamo.vllm
- --model - --model
- nvidia/Llama-3.1-8B-Instruct-FP8 - nvidia/Llama-3.1-8B-Instruct-FP8
- --disaggregation-mode
- decode
VllmPrefillWorker: VllmPrefillWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
...@@ -68,4 +70,5 @@ spec: ...@@ -68,4 +70,5 @@ spec:
- dynamo.vllm - dynamo.vllm
- --model - --model
- nvidia/Llama-3.1-8B-Instruct-FP8 - nvidia/Llama-3.1-8B-Instruct-FP8
- --is-prefill-worker - --disaggregation-mode
- prefill
...@@ -125,9 +125,9 @@ def _build_mocker_command( ...@@ -125,9 +125,9 @@ def _build_mocker_command(
# Add worker type flag for disaggregated mode # Add worker type flag for disaggregated mode
if worker_type == "prefill": if worker_type == "prefill":
command.append("--is-prefill-worker") command.extend(["--disaggregation-mode", "prefill"])
elif worker_type == "decode": elif worker_type == "decode":
command.append("--is-decode-worker") command.extend(["--disaggregation-mode", "decode"])
# Add individual CLI arguments from mocker_args # Add individual CLI arguments from mocker_args
if "speedup_ratio" in mocker_args: if "speedup_ratio" in mocker_args:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment