Unverified Commit 4ebb244b authored by Alec's avatar Alec Committed by GitHub
Browse files

feat: add --headless mode for multi-node TP/PP in dynamo.vllm (#6204)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent d3ee09d7
...@@ -266,6 +266,17 @@ class DynamoVllmArgGroup(ArgGroup): ...@@ -266,6 +266,17 @@ class DynamoVllmArgGroup(ArgGroup):
help="Number of GPUs used for classifier free guidance parallelism.", help="Number of GPUs used for classifier free guidance parallelism.",
) )
# Headless mode for multi-node TP/PP
add_negatable_bool_argument(
g,
flag_name="--headless",
env_var="DYN_VLLM_HEADLESS",
default=False,
help="Run in headless mode for multi-node TP/PP. "
"Secondary nodes run vLLM workers only, no dynamo endpoints. "
"See vLLM multi-node data parallel documentation for more details.",
)
# ModelExpress P2P # ModelExpress P2P
add_argument( add_argument(
g, g,
...@@ -319,6 +330,9 @@ class DynamoVllmConfig(ConfigBase): ...@@ -319,6 +330,9 @@ class DynamoVllmConfig(ConfigBase):
ring_degree: int = 1 ring_degree: int = 1
cfg_parallel_size: int = 1 cfg_parallel_size: int = 1
# Headless mode for multi-node TP/PP
headless: bool = False
# ModelExpress P2P # ModelExpress P2P
model_express_url: Optional[str] = None model_express_url: Optional[str] = None
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import argparse
import asyncio import asyncio
import logging import logging
import os import os
...@@ -11,6 +12,7 @@ from typing import Optional ...@@ -11,6 +12,7 @@ from typing import Optional
import uvloop import uvloop
from prometheus_client import REGISTRY, CollectorRegistry, multiprocess from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
from vllm.distributed.kv_events import ZmqEventPublisher from vllm.distributed.kv_events import ZmqEventPublisher
from vllm.entrypoints.cli.serve import run_headless
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
...@@ -89,6 +91,30 @@ async def graceful_shutdown(runtime, shutdown_event): ...@@ -89,6 +91,30 @@ async def graceful_shutdown(runtime, shutdown_event):
logging.info("DistributedRuntime shutdown complete") logging.info("DistributedRuntime shutdown complete")
def build_headless_namespace(config: Config) -> argparse.Namespace:
    """Convert the parsed engine arguments into a CLI-style namespace.

    vLLM's run_headless() expects the raw CLI namespace. Rebuilding one
    from the already-parsed ``config.engine_args`` means parse_args()
    does not have to leak transport details.

    Args:
        config: Parsed configuration; only ``config.engine_args`` is read.

    Returns:
        A new namespace holding a shallow copy of every attribute of
        ``config.engine_args``, plus ``api_server_count`` (set to 0 when the
        engine args do not already define it).
    """
    # Copy the attribute dict so the engine args object is left untouched.
    fields = dict(vars(config.engine_args))
    # run_headless() reads api_server_count; 0 means "no API server".
    fields.setdefault("api_server_count", 0)
    return argparse.Namespace(**fields)
def run_dynamo_headless(config: Config) -> None:
    """Start this node as a headless vLLM worker host for multi-node TP/PP.

    On secondary nodes only the vLLM workers are spawned: there is no engine
    core, no scheduler and no Dynamo endpoint. DistributedRuntime is never
    created on this path, so neither NATS nor etcd is required.

    Args:
        config: Parsed configuration whose engine args are handed to vLLM.
    """
    # vLLM wants a CLI-style namespace, so rebuild one from the parsed config.
    run_headless(build_headless_namespace(config))
async def worker(): async def worker():
config = parse_args() config = parse_args()
...@@ -116,6 +142,18 @@ async def worker(): ...@@ -116,6 +142,18 @@ async def worker():
if not os.path.exists(config.model): if not os.path.exists(config.model):
await fetch_model(config.model) await fetch_model(config.model)
# HEADLESS MODE: bypass DistributedRuntime entirely.
# Workers run vLLM only (no NATS, etcd, or dynamo endpoints).
if config.headless:
if checkpoint_cfg is not None:
raise ValueError(
"--headless is incompatible with checkpoint mode "
"(DYN_CHECKPOINT_SIGNAL_FILE is set). "
"Remove --headless or unset DYN_CHECKPOINT_SIGNAL_FILE."
)
run_dynamo_headless(config)
return
# CHECKPOINT MODE: Load engine BEFORE runtime creation # CHECKPOINT MODE: Load engine BEFORE runtime creation
# This allows checkpointing GPU state before runtime connections are established # This allows checkpointing GPU state before runtime connections are established
pre_created_engine = None pre_created_engine = None
......
...@@ -193,3 +193,26 @@ def test_endpoint_invalid_format_raises(mock_vllm_cli): ...@@ -193,3 +193,26 @@ def test_endpoint_invalid_format_raises(mock_vllm_cli):
) )
with pytest.raises(ValueError, match="Invalid endpoint format"): with pytest.raises(ValueError, match="Invalid endpoint format"):
parse_args() parse_args()
def test_headless_namespace_has_required_fields(mock_vllm_cli):
    """Check the namespace handed to vLLM's run_headless() in headless mode.

    Parses a minimal ``--headless`` command line, then verifies that
    build_headless_namespace() supplies the ``api_server_count`` fallback and
    keeps the core engine fields.
    """
    mock_vllm_cli("--model", "Qwen/Qwen3-0.6B", "--headless")
    config = parse_args()
    assert config.headless is True

    from dynamo.vllm.main import build_headless_namespace

    namespace = build_headless_namespace(config)

    # run_headless() needs this attribute; the fallback value is 0.
    assert hasattr(namespace, "api_server_count")
    assert namespace.api_server_count == 0

    # Core engine fields must survive the round-trip.
    for field_name in ("model", "tensor_parallel_size"):
        assert hasattr(namespace, field_name)
...@@ -79,19 +79,31 @@ Deploy prefill and decode workers on separate nodes for optimized resource utili ...@@ -79,19 +79,31 @@ Deploy prefill and decode workers on separate nodes for optimized resource utili
# Start ingress # Start ingress
python -m dynamo.frontend --router-mode kv & python -m dynamo.frontend --router-mode kv &
# Start prefill worker # Start decode worker
python -m dynamo.vllm \ python -m dynamo.vllm \
--model meta-llama/Llama-3.3-70B-Instruct --model meta-llama/Llama-3.3-70B-Instruct \
--tensor-parallel-size 8 \ --tensor-parallel-size 8 \
--enforce-eager --enforce-eager \
--is-decode-worker
``` ```
**Node 2**: Run prefill worker **Node 2**: Run prefill worker
```bash ```bash
# Start decode worker # Start prefill worker
python -m dynamo.vllm \ python -m dynamo.vllm \
--model meta-llama/Llama-3.3-70B-Instruct --model meta-llama/Llama-3.3-70B-Instruct \
--tensor-parallel-size 8 \ --tensor-parallel-size 8 \
--enforce-eager \ --enforce-eager \
--is-prefill-worker --is-prefill-worker
``` ```
### Multi-node Tensor/Pipeline Parallelism
When the total parallelism (TP × PP) exceeds the number of GPUs on a single node,
you need multiple nodes to host a **single** model instance. One node runs the full
`dynamo.vllm` process (head) while additional nodes run in `--headless` mode,
spawning only vLLM workers.
See [`examples/backends/vllm/launch/multi_node_tp.sh`](https://github.com/ai-dynamo/dynamo/blob/main/examples/backends/vllm/launch/multi_node_tp.sh) for a ready-to-use launch script that supports both head and worker roles via `--head` / `--worker` flags. The model, TP size, and node count are configurable via `MODEL`, `TENSOR_PARALLEL_SIZE`, and `NNODES` environment variables.
For details on the flags used for multi-node distributed execution (`--master-addr`, `--master-port`, `--nnodes`, `--node-rank`), see the [vLLM multiprocessing docs](https://docs.vllm.ai/en/stable/serving/parallelism_scaling/#running-vllm-with-multiprocessing).
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Multi-node TP deployment with dynamo.vllm
#
# Single script for both head and worker roles.
#
# Usage:
#   Head node:
#     bash multi_node_tp.sh --head --head-ip 10.0.0.1
#
#   Worker node (node rank defaults to 1):
#     bash multi_node_tp.sh --worker --head-ip 10.0.0.1
#
#   Additional worker nodes when NNODES > 2 (every worker needs its own rank):
#     NNODES=3 bash multi_node_tp.sh --worker --head-ip 10.0.0.1 --node-rank 2
#
# Environment:
#   MODEL                 model to serve (default: meta-llama/Llama-3.1-8B-Instruct)
#   TENSOR_PARALLEL_SIZE  value passed to --tensor-parallel-size (default: 16)
#   NNODES                value passed to --nnodes (default: 2)
#
# Prerequisites:
#   - 8 GPUs per node
#   - Head: NATS and etcd running (on this node or reachable)
#   - Worker: torch.distributed connectivity to head node
#   - Worker: head node must be started first
set -e
# On any exit, take down every process in this process group
# (the frontend and dynamo.vllm children started below).
trap 'echo "Cleaning up..."; kill 0' EXIT

MODEL="${MODEL:-meta-llama/Llama-3.1-8B-Instruct}"
TP="${TENSOR_PARALLEL_SIZE:-16}"
NNODES="${NNODES:-2}"
ROLE=""
HEAD_IP=""
# Rank used by a worker node; the head node is always rank 0.
WORKER_RANK="1"

usage() {
    echo "Usage: $0 (--head | --worker) --head-ip <IP> [--node-rank <N>]"
    exit 1
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        --head) ROLE="head"; shift ;;
        --worker) ROLE="worker"; shift ;;
        --head-ip)
            # Without this check a trailing "--head-ip" makes `shift 2` fail
            # and `set -e` aborts the script with no explanation.
            if [[ $# -lt 2 ]]; then
                echo "Error: --head-ip requires a value"
                usage
            fi
            HEAD_IP="$2"
            shift 2
            ;;
        --node-rank)
            if [[ $# -lt 2 ]]; then
                echo "Error: --node-rank requires a value"
                usage
            fi
            WORKER_RANK="$2"
            shift 2
            ;;
        *) echo "Unknown option: $1"; usage ;;
    esac
done

if [[ -z "${ROLE}" ]]; then
    echo "Error: specify --head or --worker"
    usage
fi
if [[ -z "${HEAD_IP}" ]]; then
    echo "Error: --head-ip is required"
    usage
fi

if [[ "${ROLE}" == "head" ]]; then
    echo "Starting Dynamo frontend..."
    python3 -m dynamo.frontend &

    echo "Starting dynamo.vllm head node (TP=${TP}, nnodes=${NNODES}, node-rank=0)..."
    python3 -m dynamo.vllm \
        --model "${MODEL}" \
        --tensor-parallel-size "${TP}" \
        --nnodes "${NNODES}" \
        --node-rank 0 \
        --master-addr "${HEAD_IP}" \
        --enforce-eager &

    wait
else
    # Rank 0 belongs to the head, so a worker rank must lie in 1..NNODES-1.
    if ! [[ "${WORKER_RANK}" =~ ^[0-9]+$ ]] || (( WORKER_RANK < 1 || WORKER_RANK >= NNODES )); then
        echo "Error: --node-rank must be an integer between 1 and $((NNODES - 1)) (got '${WORKER_RANK}')"
        usage
    fi

    echo "Starting dynamo.vllm headless worker (TP=${TP}, nnodes=${NNODES}, node-rank=${WORKER_RANK})..."
    python3 -m dynamo.vllm \
        --model "${MODEL}" \
        --tensor-parallel-size "${TP}" \
        --nnodes "${NNODES}" \
        --node-rank "${WORKER_RANK}" \
        --master-addr "${HEAD_IP}" \
        --enforce-eager \
        --headless
fi
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Single-machine, 2-GPU exercise of multi-node TP using the --headless flag.
#
# Starts the frontend, a head node (node-rank=0 on GPU 0) and a headless
# worker (node-rank=1 on GPU 1), all on localhost, so the headless code path
# can be validated without a second machine.
set -e
# On exit, take down every process in this process group.
trap 'echo "Cleaning up..."; kill 0' EXIT

MODEL="${MODEL:-Qwen/Qwen3-0.6B}"

# Start one dynamo.vllm node in the background.
#   $1     node rank
#   $2     GPU index exposed through CUDA_VISIBLE_DEVICES
#   $3...  extra flags appended after the shared ones
launch_node() {
    local rank="$1"
    local gpu="$2"
    shift 2
    CUDA_VISIBLE_DEVICES="${gpu}" python3 -m dynamo.vllm \
        --model "${MODEL}" \
        --tensor-parallel-size 2 \
        --nnodes 2 \
        --node-rank "${rank}" \
        --master-addr 127.0.0.1 \
        --enforce-eager \
        "$@" &
}

echo "Starting Dynamo frontend..."
python3 -m dynamo.frontend &

echo "Starting dynamo.vllm head node (TP=2, nnodes=2, node-rank=0, GPU 0)..."
launch_node 0 0

echo "Starting dynamo.vllm headless worker (TP=2, nnodes=2, node-rank=1, GPU 1)..."
launch_node 1 1 --headless

# Block until all three background processes have exited.
wait
...@@ -665,6 +665,21 @@ vllm_configs = { ...@@ -665,6 +665,21 @@ vllm_configs = {
completion_payload_default(), completion_payload_default(),
], ],
), ),
"multi_node_tp_headless": VLLMConfig(
name="multi_node_tp_headless",
directory=os.path.join(WORKSPACE_DIR, "tests/serve"),
script_name="multi_node_tp_headless.sh",
marks=[
pytest.mark.gpu_2,
pytest.mark.post_merge,
pytest.mark.timeout(300),
],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(),
completion_payload_default(),
],
),
"guided_decoding": VLLMConfig( "guided_decoding": VLLMConfig(
name="guided_decoding", name="guided_decoding",
directory=vllm_dir, directory=vllm_dir,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment