Unverified Commit f242b455 authored by Alec's avatar Alec Committed by GitHub
Browse files
parent 3b3d0f2e
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.simple_load_balancer import SimpleLoadBalancer
from components.worker import VllmDecodeWorker, VllmPrefillWorker
from dynamo.planner.planner_sla import Planner
from dynamo.planner.prometheus import Prometheus
# Wire up the service graph by linking the imported components together.
# The value returned by Frontend.link(SimpleLoadBalancer) is kept so that both
# worker classes can be linked through it.
# NOTE(review): presumably link() returns a handle to the linked component;
# confirm against the SDK before chaining further calls on it.
load_balancer = Frontend.link(SimpleLoadBalancer)
# Link the prefill worker and the decode worker through the handle above.
load_balancer.link(VllmPrefillWorker)
load_balancer.link(VllmDecodeWorker)
# Planner and Prometheus are linked from Frontend directly, not through the
# load balancer handle.
Frontend.link(Planner)
Frontend.link(Prometheus)
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Launch script: one ingress process plus one worker process.
# Abort as soon as any foreground command fails.
set -e
# On script exit, print a message and signal every process in this script's
# process group (kill 0), so the backgrounded ingress is torn down as well.
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress (backgrounded with & so the worker can start next)
dynamo run in=http out=dyn &
# run worker in the foreground; the script lives until this command returns,
# at which point the EXIT trap above fires.
python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Launch script: one ingress started with --router-mode kv plus two workers.
# Abort as soon as any foreground command fails.
set -e
# On script exit, print a message and signal every process in this script's
# process group (kill 0), so the backgrounded ingress and worker are stopped.
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress in the background, with the router mode set to "kv"
dynamo run in=http out=dyn --router-mode kv &
# run workers: one per GPU (CUDA devices 0 and 1), same model and flags.
# The first is backgrounded; the second stays in the foreground and keeps the
# script alive until it returns.
CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
CUDA_VISIBLE_DEVICES=1 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Launch script: one KV-routed ingress plus four data-parallel workers,
# one worker per GPU.
set -e
trap 'echo Cleaning up...; kill 0' EXIT

# Start the ingress in the background with the router mode set to "kv".
dynamo run in=http out=dyn --router-mode kv &

# Data Parallel Attention / Expert Parallelism
# Routing to DP workers managed by Dynamo
# Qwen3-30B was chosen because it's a small MoE that can fit on smaller GPUs
# (L40S for example).
dp_size=4
for ((rank = 0; rank < dp_size; rank++)); do
    # Each worker is pinned to the GPU whose index equals its DP rank and is
    # backgrounded so all ranks start concurrently.
    CUDA_VISIBLE_DEVICES=$rank python3 components/main.py \
        --model Qwen/Qwen3-30B-A3B \
        --data-parallel-rank $rank \
        --data-parallel-size $dp_size \
        --enable-expert-parallel \
        --enforce-eager \
        --kv-events-port 49500 &
done

echo "All workers starting. (press Ctrl+C to stop)..."
# Block until the background jobs finish; the EXIT trap handles teardown.
wait
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Launch script: one ingress plus two workers on separate GPUs, one of which
# is started with --is-prefill-worker.
# Abort as soon as any foreground command fails.
set -e
# On script exit, print a message and signal every process in this script's
# process group (kill 0), so the backgrounded processes are stopped too.
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress (backgrounded)
dynamo run in=http out=dyn &
# Worker on GPU 0, started without --is-prefill-worker, in the background.
CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
# Worker on GPU 1, started with --is-prefill-worker. It runs in the foreground
# and keeps the script alive until it returns.
CUDA_VISIBLE_DEVICES=1 python3 components/main.py \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--is-prefill-worker
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Launch script: one KV-routed ingress, two workers without the prefill flag
# (GPUs 0 and 1) and one worker with --is-prefill-worker (GPU 2).
# Abort as soon as any foreground command fails.
set -e
# On script exit, print a message and signal every process in this script's
# process group (kill 0), so the backgrounded processes are stopped too.
trap 'echo Cleaning up...; kill 0' EXIT
# run ingress in the background, with the router mode set to "kv"
dynamo run in=http out=dyn --router-mode kv &
# routing will happen between the two decode workers
# (both are backgrounded; each is pinned to its own GPU)
CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
CUDA_VISIBLE_DEVICES=1 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
# Worker on GPU 2, started with --is-prefill-worker. It runs in the foreground
# and keeps the script alive until it returns.
CUDA_VISIBLE_DEVICES=2 python3 components/main.py \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--is-prefill-worker
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Launches this node's share of a multi-node DeepSeek-R1 data-parallel /
# expert-parallel deployment: the ingress (on node rank 0 only) plus one
# worker per local GPU. Output of every process is mirrored into LOG_DIR.
#
# Required: --num-nodes, --node-rank, --gpus-per-node
# Optional: --master-addr (default: localhost), --log-dir (default: ./logs)
#
# -e: abort on the first failing command; -x: echo commands as they run.
set -ex

# Default values
NUM_NODES=""
NODE_RANK=""
GPUS_PER_NODE=""
MASTER_ADDR="localhost"
LOG_DIR="./logs"

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --num-nodes)
            NUM_NODES="$2"
            shift 2
            ;;
        --node-rank)
            NODE_RANK="$2"
            shift 2
            ;;
        --gpus-per-node)
            GPUS_PER_NODE="$2"
            shift 2
            ;;
        --master-addr)
            MASTER_ADDR="$2"
            shift 2
            ;;
        --log-dir)
            LOG_DIR="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo " --num-nodes N Number of nodes in the cluster (required, int)"
            echo " --node-rank M Rank of this node (0-based, required, int)"
            echo " --gpus-per-node L Number of GPUs per node (required, int)"
            echo " --master-addr ADDR Master node address (default: localhost)"
            echo " --log-dir DIR Directory for log files (default: ./logs)"
            echo " -h, --help Show this help message"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use --help for usage information"
            exit 1
            ;;
    esac
done

# Validate required arguments
if [ -z "$NUM_NODES" ] || [ -z "$NODE_RANK" ] || [ -z "$GPUS_PER_NODE" ]; then
    echo "Error: Missing required arguments"
    echo "Required: --num-nodes, --node-rank, --gpus-per-node"
    echo "Use --help for usage information"
    exit 1
fi

# These values feed shell arithmetic below, so reject anything that is not a
# plain non-negative integer instead of letting $(( )) interpret it.
for int_arg in "$NUM_NODES" "$NODE_RANK" "$GPUS_PER_NODE"; do
    if ! [[ "$int_arg" =~ ^[0-9]+$ ]]; then
        echo "Error: --num-nodes, --node-rank and --gpus-per-node must be non-negative integers"
        echo "Use --help for usage information"
        exit 1
    fi
done

# Calculate data parallel size
DATA_PARALLEL_SIZE=$((NUM_NODES * GPUS_PER_NODE))

echo "Configuration:"
echo " Number of nodes: $NUM_NODES"
echo " Node rank: $NODE_RANK"
echo " GPUs per node: $GPUS_PER_NODE"
echo " Data parallel size: $DATA_PARALLEL_SIZE"
echo " Master address: $MASTER_ADDR"
echo " Log directory: $LOG_DIR"

# On exit, signal every process in this script's process group so the
# backgrounded ingress and workers do not outlive the script.
trap 'echo Cleaning up...; kill 0' EXIT

# The log directory must exist before any `tee` below opens a file in it;
# otherwise the backgrounded ingress pipeline can race the mkdir and lose
# its log.
mkdir -p "$LOG_DIR"

# run ingress if it's node 0
if [ "$NODE_RANK" -eq 0 ]; then
    DYN_LOG=debug dynamo-run in=http out=dyn --router-mode kv 2>&1 | tee "$LOG_DIR/dsr1_dep_ingress.log" &
fi

# Data Parallel Attention / Expert Parallelism
# Routing to DP workers managed by Dynamo
for ((i = 0; i < GPUS_PER_NODE; i++)); do
    # Global DP rank = local GPU index offset by this node's position.
    dp_rank=$((i + NODE_RANK * GPUS_PER_NODE))
    CUDA_VISIBLE_DEVICES=$i \
    VLLM_ALL2ALL_BACKEND="deepep_low_latency" \
    VLLM_USE_DEEP_GEMM=1 \
    VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1 \
    python3 components/main.py \
        --model deepseek-ai/DeepSeek-R1 \
        --data-parallel-size "$DATA_PARALLEL_SIZE" \
        --data-parallel-rank "$dp_rank" \
        --enable-expert-parallel \
        --max-model-len 10240 \
        --data-parallel-address "$MASTER_ADDR" \
        --data-parallel-rpc-port 13345 \
        --gpu-memory-utilization 0.95 \
        --enforce-eager \
        --kv-events-port 49700 2>&1 | tee "$LOG_DIR/dsr1_dep_${dp_rank}.log" &
done

echo "All workers starting. (press Ctrl+C to stop)..."
# Block until the background jobs finish; the EXIT trap handles teardown.
wait
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Multi-node Examples
This guide covers deploying vLLM across multiple nodes using Dynamo's distributed capabilities.
## Prerequisites
Multi-node deployments require:
- Multiple nodes with GPU resources
- Network connectivity between nodes (the faster, the better)
- Firewall rules allowing NATS/ETCD communication
## Infrastructure Setup
### Step 1: Start NATS/ETCD on Head Node
Start the required services on your head node. These endpoints must be accessible from all worker nodes:
```bash
# On head node (node-1)
docker compose -f deploy/metrics/docker-compose.yml up -d
```
Default ports:
- NATS: 4222
- ETCD: 2379
### Step 2: Configure Environment Variables
Set the head node IP address and service endpoints. **Set this on all nodes** for easy copy-paste:
```bash
# Set this on ALL nodes - replace with your actual head node IP
export HEAD_NODE_IP="<your-head-node-ip>"
# Service endpoints (set on all nodes)
export NATS_SERVER="nats://${HEAD_NODE_IP}:4222"
export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379"
```
## Deployment Patterns
### Multi-node Aggregated Serving
Deploy vLLM workers across multiple nodes for horizontal scaling:
**Node 1 (Head Node)**: Run ingress and first worker
```bash
# Start ingress
dynamo run in=http out=dyn
# Start vLLM worker
python3 components/main.py \
--model meta-llama/Llama-3.3-70B-Instruct \
--tensor-parallel-size 8 \
--enforce-eager
```
**Node 2**: Run additional worker
```bash
# Start vLLM worker
python3 components/main.py \
--model meta-llama/Llama-3.3-70B-Instruct \
--tensor-parallel-size 8 \
--enforce-eager
```
### Multi-node Disaggregated Serving
Deploy prefill and decode workers on separate nodes for optimized resource utilization:
**Node 1**: Run ingress and decode worker
```bash
# Start ingress
dynamo run in=http out=dyn &
# Start decode worker (started without `--is-prefill-worker`)
python3 components/main.py \
--model meta-llama/Llama-3.3-70B-Instruct \
--tensor-parallel-size 8 \
--enforce-eager
```
**Node 2**: Run prefill worker
```bash
# Start prefill worker
python3 components/main.py \
--model meta-llama/Llama-3.3-70B-Instruct \
--tensor-parallel-size 8 \
--enforce-eager \
--is-prefill-worker
```
## TODO
## Large Model Deployment
For models requiring more GPUs than available on a single node such as tensor-parallel-size 16:
**Node 1**: First part of tensor-parallel model
```bash
# Start ingress
dynamo run in=http out=dyn &
```
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: rename to avoid ambiguity with vllm package
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser
from dynamo.sdk.lib.config import ServiceConfig
def parse_vllm_args(service_name, prefix) -> AsyncEngineArgs:
    """Build vLLM ``AsyncEngineArgs`` from the service configuration.

    The argument list is obtained from the ``ServiceConfig`` singleton via
    ``as_args(service_name, prefix=prefix)`` and parsed with a
    ``FlexibleArgumentParser`` that knows the standard vLLM engine flags plus
    two extra options, ``--enable-disagg`` and ``--data-parallel-start-rank``
    (``-dpr``).

    Args:
        service_name: Name passed to ``ServiceConfig.as_args`` to select the
            service whose arguments are read.
        prefix: Prefix passed through to ``ServiceConfig.as_args``.

    Returns:
        The ``AsyncEngineArgs`` built by ``from_cli_args``, with the two extra
        options copied onto it as ``enable_disagg`` and
        ``data_parallel_start_rank``.
    """
    cli_tokens = ServiceConfig.get_instance().as_args(service_name, prefix=prefix)

    arg_parser = FlexibleArgumentParser()
    # Options that are not part of vLLM's own CLI surface.
    arg_parser.add_argument(
        "--enable-disagg",
        action="store_true",
        help="Enable disaggregation",
    )
    arg_parser.add_argument(
        "--data-parallel-start-rank",
        "-dpr",
        type=int,
        default=0,
        help="Starting data parallel rank for secondary nodes.",
    )
    # Register the standard engine arguments on the same parser.
    arg_parser = AsyncEngineArgs.add_cli_args(arg_parser)

    namespace = arg_parser.parse_args(cli_tokens)
    result = AsyncEngineArgs.from_cli_args(namespace)

    # from_cli_args is not relied on for the extra options; copy them across
    # explicitly so callers can read them off the returned object.
    for extra_field in ("enable_disagg", "data_parallel_start_rank"):
        setattr(result, extra_field, getattr(namespace, extra_field))
    return result
......@@ -46,7 +46,7 @@ class DistributedRuntime:
...
def namespace(self, name: str, path: str) -> Namespace:
def namespace(self, name: str) -> Namespace:
"""
Create a `Namespace` object
"""
......@@ -428,7 +428,7 @@ class WorkerMetricsPublisher:
Create a `WorkerMetricsPublisher` object
"""
def create_service(self, component: Component) -> None:
def create_endpoint(self, component: Component) -> None:
"""
Similar to Component.create_service, but only service created through
this method will interact with KV router of the same component.
......
......@@ -151,7 +151,6 @@ impl KvEventPublisher {
}
pub fn publish(&self, event: KvCacheEvent) -> Result<(), mpsc::error::SendError<KvCacheEvent>> {
tracing::trace!("Publish event: {:?}", event);
self.tx.send(event)
}
......@@ -195,6 +194,7 @@ async fn start_event_processor<P: EventPublisher + Send + Sync + 'static>(
};
// Encapsulate in a router event and publish.
tracing::trace!("Event processor for worker_id {} processing event: {:?}", worker_id, event.data);
let router_event = RouterEvent::new(worker_id, event);
if let Err(e) = publisher.publish(KV_EVENT_SUBJECT, &router_event).await {
tracing::error!("Failed to publish event: {}", e);
......@@ -247,15 +247,19 @@ pub async fn start_zmq_listener(
}
let mut consecutive_errors = 0u32;
#[allow(unused_assignments)]
let mut exit_reason = "unknown";
let mut messages_processed = 0u64;
loop {
'main: loop {
tokio::select! {
biased;
// Check for cancellation
_ = cancellation_token.cancelled() => {
tracing::info!("ZMQ listener received cancellation signal");
break;
tracing::debug!("ZMQ listener received cancellation signal");
exit_reason = "cancellation token cancelled";
break 'main;
}
// Receive message
......@@ -270,7 +274,8 @@ pub async fn start_zmq_listener(
consecutive_errors=%consecutive_errors,
"Too many consecutive ZMQ errors, terminating listener"
);
break;
exit_reason = "too many consecutive errors";
break 'main;
}
// Simple exponential backoff with max exponent to prevent overflow
......@@ -316,18 +321,29 @@ pub async fn start_zmq_listener(
continue;
};
// For each of our events, convert them to [`KvCacheEvent`] and send to the event_processor.
tracing::trace!(
"ZMQ listener on {} received batch with {} events (seq={})",
zmq_endpoint,
batch.events.len(),
seq
);
for raw_event in batch.events.into_iter() {
let event = convert_event(raw_event, seq, kv_block_size, &warning_count);
if tx.send(event).is_err() {
tracing::warn!("Failed to send message to channel - receiver dropped");
return;
exit_reason = "channel receiver dropped";
break 'main;
}
messages_processed += 1;
}
}
}
tracing::debug!("ZMQ listener exiting");
}
tracing::debug!(
"ZMQ listener exiting, reason: {}, messages processed: {}",
exit_reason,
messages_processed
);
}
/// Convert a raw event coming from the ZMQ channel into the internal
......@@ -438,6 +454,8 @@ pub fn create_stored_blocks(
struct KvEventBatch {
ts: f64,
events: Vec<RawKvEvent>,
#[serde(alias = "dp_rank")]
data_parallel_rank: u32, // we are ignoring this for now
}
#[derive(Debug, Deserialize, Serialize)]
......@@ -770,7 +788,11 @@ mod tests_startup_helpers {
lora_id: None,
}];
let batch = KvEventBatch { ts: 0.0, events };
let batch = KvEventBatch {
ts: 0.0,
events,
data_parallel_rank: 1,
};
let payload = Bytes::from(rmps::to_vec(&batch).unwrap());
......
......@@ -395,8 +395,9 @@ impl WorkerSelector for DefaultWorkerSelector {
worker_logits.insert(*worker_id, logit);
tracing::info!(
"Formula for {worker_id}: {logit:.3} = {:.1} * {prefill_blocks:.3} + {potential_blocks:.3}",
"Formula for {worker_id}: {logit:.3} = {:.1} * {prefill_blocks:.3} + {potential_blocks:.3} (cached_blocks: {cached_blocks})",
self.kv_router_config.overlap_score_weight,
cached_blocks = cached_blocks
);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment