refactor: vllm v1 examples (#1756)

Signed-off-by: Alec <35311602+alec-flowers@users.noreply.github.com> Co-authored-by: ptarasiewiczNV <104908264+ptarasiewiczNV@users.noreply.github.com> Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>

refactor: vllm v1 examples (#1756)
Signed-off-by: Alec <35311602+alec-flowers@users.noreply.github.com> Co-authored-by: ptarasiewiczNV <104908264+ptarasiewiczNV@users.noreply.github.com> Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>
f242b455 · Alec · GitHub · 3b3d0f2e · 3b3d0f2e · f242b455
Unverified Commit f242b455 authored Jul 10, 2025 by Alec Committed by GitHub Jul 10, 2025
12 changed files
--- a/examples/vllm_v1/graphs/disagg_planner.py
+++ b/examples/vllm_v1/graphs/disagg_planner.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from components.frontend import Frontend
-from components.simple_load_balancer import SimpleLoadBalancer
-from components.worker import VllmDecodeWorker, VllmPrefillWorker
-
-from dynamo.planner.planner_sla import Planner
-from dynamo.planner.prometheus import Prometheus
-
-load_balancer = Frontend.link(SimpleLoadBalancer)
-load_balancer.link(VllmPrefillWorker)
-load_balancer.link(VllmDecodeWorker)
-
-Frontend.link(Planner)
-Frontend.link(Prometheus)
--- a/examples/vllm_v1/launch/agg.sh
+++ b/examples/vllm_v1/launch/agg.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# run ingress
+dynamo run in=http out=dyn &
+
+# run worker
+python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager
--- a/examples/vllm_v1/launch/agg_router.sh
+++ b/examples/vllm_v1/launch/agg_router.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# run ingress
+dynamo run in=http out=dyn --router-mode kv &
+
+# run workers
+CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
+
+CUDA_VISIBLE_DEVICES=1 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager
--- a/examples/vllm_v1/launch/dep.sh
+++ b/examples/vllm_v1/launch/dep.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# run ingress
+dynamo run in=http out=dyn --router-mode kv &
+
+# Data Parallel Attention / Expert Parallelism
+# Routing to DP workers managed by Dynamo
+# Chose Qwen3-30B because it's a small MoE that can fit on smaller GPUs (L40S for example)
+for i in {0..3}; do
+    CUDA_VISIBLE_DEVICES=$i python3 components/main.py \
+    --model Qwen/Qwen3-30B-A3B \
+    --data-parallel-rank $i \
+    --data-parallel-size 4 \
+    --enable-expert-parallel \
+    --enforce-eager \
+    --kv-events-port 49500 &
+done
+
+echo "All workers starting. (press Ctrl+C to stop)..."
+wait
--- a/examples/vllm_v1/launch/disagg.sh
+++ b/examples/vllm_v1/launch/disagg.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# run ingress
+dynamo run in=http out=dyn &
+
+CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
+
+CUDA_VISIBLE_DEVICES=1 python3 components/main.py \
+    --model Qwen/Qwen3-0.6B \
+    --enforce-eager \
+    --is-prefill-worker
--- a/examples/vllm_v1/launch/disagg_router.sh
+++ b/examples/vllm_v1/launch/disagg_router.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# run ingress
+dynamo run in=http out=dyn --router-mode kv &
+
+# routing will happen between the two decode workers
+CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
+
+CUDA_VISIBLE_DEVICES=1 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
+
+CUDA_VISIBLE_DEVICES=2 python3 components/main.py \
+    --model Qwen/Qwen3-0.6B \
+    --enforce-eager \
+    --is-prefill-worker
--- a/examples/vllm_v1/launch/dsr1_dep.sh
+++ b/examples/vllm_v1/launch/dsr1_dep.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+set -ex
+
+# Default values
+NUM_NODES=""
+NODE_RANK=""
+GPUS_PER_NODE=""
+MASTER_ADDR="localhost"
+LOG_DIR="./logs"
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --num-nodes)
+            NUM_NODES="$2"
+            shift 2
+            ;;
+        --node-rank)
+            NODE_RANK="$2"
+            shift 2
+            ;;
+        --gpus-per-node)
+            GPUS_PER_NODE="$2"
+            shift 2
+            ;;
+        --master-addr)
+            MASTER_ADDR="$2"
+            shift 2
+            ;;
+        --log-dir)
+            LOG_DIR="$2"
+            shift 2
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo "Options:"
+            echo "  --num-nodes N         Number of nodes in the cluster (required, int)"
+            echo "  --node-rank M         Rank of this node (0-based, required, int)"
+            echo "  --gpus-per-node L     Number of GPUs per node (required, int)"
+            echo "  --master-addr ADDR    Master node address (default: localhost)"
+            echo "  --log-dir DIR         Directory for log files (default: ./logs)"
+            echo "  -h, --help           Show this help message"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+# Validate required arguments
+if [ -z "$NUM_NODES" ] || [ -z "$NODE_RANK" ] || [ -z "$GPUS_PER_NODE" ]; then
+    echo "Error: Missing required arguments"
+    echo "Required: --num-nodes, --node-rank, --gpus-per-node"
+    echo "Use --help for usage information"
+    exit 1
+fi
+
+# Calculate data parallel size
+DATA_PARALLEL_SIZE=$((NUM_NODES * GPUS_PER_NODE))
+
+echo "Configuration:"
+echo "  Number of nodes: $NUM_NODES"
+echo "  Node rank: $NODE_RANK"
+echo "  GPUs per node: $GPUS_PER_NODE"
+echo "  Data parallel size: $DATA_PARALLEL_SIZE"
+echo "  Master address: $MASTER_ADDR"
+echo "  Log directory: $LOG_DIR"
+
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# Create the log directory before anything writes into it: the ingress
+# pipeline below is backgrounded and tees into $LOG_DIR, so creating the
+# directory afterwards lets tee race mkdir and fail to open its log file.
+mkdir -p "$LOG_DIR"
+
+# run ingress if it's node 0
+if [ "$NODE_RANK" -eq 0 ]; then
+    DYN_LOG=debug dynamo-run in=http out=dyn --router-mode kv 2>&1 | tee "$LOG_DIR/dsr1_dep_ingress.log" &
+fi
+
+# Data Parallel Attention / Expert Parallelism
+# Routing to DP workers managed by Dynamo
+# One worker is launched per local GPU. Its data-parallel rank is offset by
+# NODE_RANK * GPUS_PER_NODE so ranks do not collide between nodes.
+for ((i=0; i<GPUS_PER_NODE; i++)); do
+    dp_rank=$((i + NODE_RANK * GPUS_PER_NODE))
+    # Each worker runs in the background; combined stdout/stderr is shown on
+    # the terminal and also written to a per-rank log file.
+    CUDA_VISIBLE_DEVICES=$i \
+        VLLM_ALL2ALL_BACKEND="deepep_low_latency" \
+        VLLM_USE_DEEP_GEMM=1 \
+        VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1 \
+        python3 components/main.py \
+        --model deepseek-ai/DeepSeek-R1 \
+        --data-parallel-size "$DATA_PARALLEL_SIZE" \
+        --data-parallel-rank "$dp_rank" \
+        --enable-expert-parallel \
+        --max-model-len 10240 \
+        --data-parallel-address "$MASTER_ADDR" \
+        --data-parallel-rpc-port 13345 \
+        --gpu-memory-utilization 0.95 \
+        --enforce-eager \
+        --kv-events-port 49700 2>&1 | tee "$LOG_DIR/dsr1_dep_${dp_rank}.log" &
+done
+
+echo "All workers starting. (press Ctrl+C to stop)..."
+wait
--- a/examples/vllm_v1/multi-node.md
+++ b/examples/vllm_v1/multi-node.md
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Multi-node Examples
+
+This guide covers deploying vLLM across multiple nodes using Dynamo's distributed capabilities.
+
+## Prerequisites
+
+Multi-node deployments require:
+- Multiple nodes with GPU resources
+- Network connectivity between nodes (the faster, the better)
+- Firewall rules allowing NATS/ETCD communication
+
+## Infrastructure Setup
+
+### Step 1: Start NATS/ETCD on Head Node
+
+Start the required services on your head node. These endpoints must be accessible from all worker nodes:
+
+```bash
+# On head node (node-1)
+docker compose -f deploy/metrics/docker-compose.yml up -d
+```
+
+Default ports:
+- NATS: 4222
+- ETCD: 2379
+
+### Step 2: Configure Environment Variables
+
+Set the head node IP address and service endpoints. **Set this on all nodes** for easy copy-paste:
+
+```bash
+# Set this on ALL nodes - replace with your actual head node IP
+export HEAD_NODE_IP="<your-head-node-ip>"
+
+# Service endpoints (set on all nodes)
+export NATS_SERVER="nats://${HEAD_NODE_IP}:4222"
+export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379"
+```
+
+## Deployment Patterns
+
+### Multi-node Aggregated Serving
+
+Deploy vLLM workers across multiple nodes for horizontal scaling:
+
+**Node 1 (Head Node)**: Run ingress and first worker
+```bash
+# Start ingress
+dynamo run in=http out=dyn &
+
+# Start vLLM worker
+python3 components/main.py \
+  --model meta-llama/Llama-3.3-70B-Instruct \
+  --tensor-parallel-size 8 \
+  --enforce-eager
+```
+
+**Node 2**: Run additional worker
+```bash
+# Start vLLM worker
+python3 components/main.py \
+  --model meta-llama/Llama-3.3-70B-Instruct \
+  --tensor-parallel-size 8 \
+  --enforce-eager
+```
+
+### Multi-node Disaggregated Serving
+
+Deploy prefill and decode workers on separate nodes for optimized resource utilization:
+
+**Node 1**: Run ingress and decode workers
+```bash
+# Start ingress
+dynamo run in=http out=dyn &
+
+# Start decode worker
+python3 components/main.py \
+  --model meta-llama/Llama-3.3-70B-Instruct \
+  --tensor-parallel-size 8 \
+  --enforce-eager
+```
+
+**Node 2**: Run prefill workers
+```bash
+# Start prefill worker
+python3 components/main.py \
+  --model meta-llama/Llama-3.3-70B-Instruct \
+  --tensor-parallel-size 8 \
+  --enforce-eager \
+  --is-prefill-worker
+```
+
+
+## TODO
+
+## Large Model Deployment
+
+For models that require more GPUs than a single node provides (for example, tensor-parallel-size 16):
+
+**Node 1**: First part of tensor-parallel model
+```bash
+# Start ingress
+dynamo run in=http out=dyn &
+```
+
--- a/examples/vllm_v1/utils/args.py
+++ b/examples/vllm_v1/utils/args.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TODO: rename to avoid ambiguity with vllm package
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.utils import FlexibleArgumentParser
-
-from dynamo.sdk.lib.config import ServiceConfig
-
-
-def parse_vllm_args(service_name, prefix) -> AsyncEngineArgs:
-    config = ServiceConfig.get_instance()
-    vllm_args = config.as_args(service_name, prefix=prefix)
-    parser = FlexibleArgumentParser()
-    parser.add_argument(
-        "--enable-disagg", action="store_true", help="Enable disaggregation"
-    )
-    parser.add_argument(
-        "--data-parallel-start-rank",
-        "-dpr",
-        type=int,
-        default=0,
-        help="Starting data parallel rank for secondary nodes.",
-    )
-    parser = AsyncEngineArgs.add_cli_args(parser)
-    args = parser.parse_args(vllm_args)
-    engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine_args.enable_disagg = args.enable_disagg
-    engine_args.data_parallel_start_rank = args.data_parallel_start_rank
-    return engine_args
--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -46,7 +46,7 @@ class DistributedRuntime:

    ...

-    def namespace(self, name: str, path: str) -> Namespace:
+    def namespace(self, name: str) -> Namespace:
        """
        Create a `Namespace` object
        """
@@ -428,7 +428,7 @@ class WorkerMetricsPublisher:
        Create a `WorkerMetricsPublisher` object
        """

-    def create_service(self, component: Component) -> None:
+    def create_endpoint(self, component: Component) -> None:
        """
        Similar to Component.create_service, but only service created through
        this method will interact with KV router of the same component.

--- a/lib/llm/src/kv_router/publisher.rs
+++ b/lib/llm/src/kv_router/publisher.rs
@@ -151,7 +151,6 @@ impl KvEventPublisher {
    }

    pub fn publish(&self, event: KvCacheEvent) -> Result<(), mpsc::error::SendError<KvCacheEvent>> {
-        tracing::trace!("Publish event: {:?}", event);
        self.tx.send(event)
    }

@@ -195,6 +194,7 @@ async fn start_event_processor<P: EventPublisher + Send + Sync + 'static>(
                };

                // Encapsulate in a router event and publish.
+                tracing::trace!("Event processor for worker_id {} processing event: {:?}", worker_id, event.data);
                let router_event = RouterEvent::new(worker_id, event);
                if let Err(e) = publisher.publish(KV_EVENT_SUBJECT, &router_event).await {
                    tracing::error!("Failed to publish event: {}", e);
@@ -247,15 +247,19 @@ pub async fn start_zmq_listener(
    }

    let mut consecutive_errors = 0u32;
+    #[allow(unused_assignments)]
+    let mut exit_reason = "unknown";
+    let mut messages_processed = 0u64;

-    loop {
+    'main: loop {
        tokio::select! {
            biased;

            // Check for cancellation
            _ = cancellation_token.cancelled() => {
-                tracing::info!("ZMQ listener received cancellation signal");
-                break;
+                tracing::debug!("ZMQ listener received cancellation signal");
+                exit_reason = "cancellation token cancelled";
+                break 'main;
            }

            // Receive message
@@ -270,7 +274,8 @@ pub async fn start_zmq_listener(
                            consecutive_errors=%consecutive_errors,
                            "Too many consecutive ZMQ errors, terminating listener"
                        );
-                        break;
+                        exit_reason = "too many consecutive errors";
+                        break 'main;
                    }

                    // Simple exponential backoff with max exponent to prevent overflow
@@ -316,18 +321,29 @@ pub async fn start_zmq_listener(
                    continue;
                };

-                // For each of our events, convert them to [`KvCacheEvent`] and send to the event_processor.
+                tracing::trace!(
+                    "ZMQ listener on {} received batch with {} events (seq={})",
+                    zmq_endpoint,
+                    batch.events.len(),
+                    seq
+                );
                for raw_event in batch.events.into_iter() {
                    let event = convert_event(raw_event, seq, kv_block_size, &warning_count);
                    if tx.send(event).is_err() {
                        tracing::warn!("Failed to send message to channel - receiver dropped");
-                        return;
+                        exit_reason = "channel receiver dropped";
+                        break 'main;
                    }
+                    messages_processed += 1;
                }
            }
        }
-        tracing::debug!("ZMQ listener exiting");
    }
+    tracing::debug!(
+        "ZMQ listener exiting, reason: {}, messages processed: {}",
+        exit_reason,
+        messages_processed
+    );
 }

 /// Convert a raw event coming from the ZMQ channel into the internal
@@ -438,6 +454,8 @@ pub fn create_stored_blocks(
 struct KvEventBatch {
    ts: f64,
    events: Vec<RawKvEvent>,
+    #[serde(alias = "dp_rank")]
+    data_parallel_rank: u32, // we are ignoring this for now
 }

 #[derive(Debug, Deserialize, Serialize)]
@@ -770,7 +788,11 @@ mod tests_startup_helpers {
            lora_id: None,
        }];

-        let batch = KvEventBatch { ts: 0.0, events };
+        let batch = KvEventBatch {
+            ts: 0.0,
+            events,
+            data_parallel_rank: 1,
+        };

        let payload = Bytes::from(rmps::to_vec(&batch).unwrap());


--- a/lib/llm/src/kv_router/scheduler.rs
+++ b/lib/llm/src/kv_router/scheduler.rs
@@ -395,8 +395,9 @@ impl WorkerSelector for DefaultWorkerSelector {
            worker_logits.insert(*worker_id, logit);

            tracing::info!(
-                "Formula for {worker_id}: {logit:.3} = {:.1} * {prefill_blocks:.3} + {potential_blocks:.3}",
+                "Formula for {worker_id}: {logit:.3} = {:.1} * {prefill_blocks:.3} + {potential_blocks:.3}  (cached_blocks: {cached_blocks})",
                self.kv_router_config.overlap_score_weight,
+                cached_blocks = cached_blocks
            );
        }