Unverified Commit dd7ceb4a authored by Yi Yao's avatar Yi Yao Committed by GitHub
Browse files

feat: add examples and unit tests for vLLM aggregated serving on XPU (#7146)


Signed-off-by: Yi Yao <yi.a.yao@intel.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent c09ac697
...@@ -40,27 +40,36 @@ Each launch script runs the frontend and worker(s) in a single terminal. You can ...@@ -40,27 +40,36 @@ Each launch script runs the frontend and worker(s) in a single terminal. You can
The simplest deployment pattern: a single worker handles both prefill and decode. Requires 1 GPU. The simplest deployment pattern: a single worker handles both prefill and decode. Requires 1 GPU.
Run on CUDA devices:
```bash ```bash
cd $DYNAMO_HOME/examples/backends/vllm cd $DYNAMO_HOME/examples/backends/vllm
bash launch/agg.sh bash launch/agg.sh
``` ```
For XPU deployments, use a larger block size and set it to at least `64` (`>= 64`): Run on XPUs:
```bash ```bash
# XeTLA ChunkPrefill FP8KV only supports block_size >= 64
cd $DYNAMO_HOME/examples/backends/vllm cd $DYNAMO_HOME/examples/backends/vllm
bash launch/agg.sh --block-size 64 bash launch/xpu/agg_xpu.sh
``` ```
### Aggregated Serving with KV Routing ### Aggregated Serving with KV Routing
Two workers behind a [KV-aware router](../../components/router/README.md) that maximizes cache reuse. Requires 2 GPUs. Two workers behind a [KV-aware router](../../components/router/README.md) that maximizes cache reuse. Requires 2 GPUs.
Run on CUDA devices:
```bash ```bash
cd $DYNAMO_HOME/examples/backends/vllm cd $DYNAMO_HOME/examples/backends/vllm
bash launch/agg_router.sh bash launch/agg_router.sh
``` ```
Run on XPUs:
```bash
cd $DYNAMO_HOME/examples/backends/vllm
bash launch/xpu/agg_router_xpu.sh
```
This launches the frontend in KV routing mode with two workers publishing KV events over ZMQ. This launches the frontend in KV routing mode with two workers publishing KV events over ZMQ.
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on XPU with LoRA enabled and KV-aware routing.
# Starts one frontend in KV router mode plus two vLLM workers; each worker is
# given its own ZE_AFFINITY_MASK and publishes KV cache events over ZMQ.
#
# Environment overrides: DYN_HTTP_PORT, DYN_SYSTEM_PORT1, DYN_SYSTEM_PORT2.

set -e
# On any exit, signal the whole process group so background jobs are torn down.
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../../common/launch_utils.sh"

# S3-compatible object store settings used for the LoRA artifacts.
# NOTE(review): endpoint/credentials look like a local MinIO default - confirm.
export \
  AWS_ENDPOINT=http://localhost:9000 \
  AWS_ACCESS_KEY_ID=minioadmin \
  AWS_SECRET_ACCESS_KEY=minioadmin \
  AWS_REGION=us-east-1 \
  AWS_ALLOW_HTTP=true

# Dynamo LoRA configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
mkdir -p "$DYN_LORA_PATH"

# Deterministic hashing for KV event IDs
export PYTHONHASHSEED=0
export VLLM_TARGET_DEVICE=xpu

# Common configuration
MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
SYSTEM_PORT1="${DYN_SYSTEM_PORT1:-8081}"
SYSTEM_PORT2="${DYN_SYSTEM_PORT2:-8082}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner --no-curl "Launching Aggregated + LoRA + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"

# Usage hints, one output line per argument.
printf '%s\n' \
  "" \
  "Once running, test with:" \
  "" \
  " # Check available models" \
  " curl http://localhost:${HTTP_PORT}/v1/models | jq ." \
  "" \
  " # Load LoRA to both instances (using S3 URI)" \
  " curl -s -X POST http://localhost:${SYSTEM_PORT1}/v1/loras \\" \
  " -H 'Content-Type: application/json' \\" \
  " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\"," \
  " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ." \
  "" \
  " curl -s -X POST http://localhost:${SYSTEM_PORT2}/v1/loras \\" \
  " -H 'Content-Type: application/json' \\" \
  " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\"," \
  " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ." \
  "" \
  " # Test LoRA inference" \
  " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" \
  " -H 'Content-Type: application/json' \\" \
  " -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\"," \
  " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," \
  " \"max_tokens\": 32}' | jq ." \
  "" \
  "=========================================="

# Frontend + KV router.
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv --router-reset-states &

# Workers.
# --enforce-eager is used for quick deployment; remove it for production use.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# Flags shared by both workers; only the KV-events endpoint differs.
lora_worker_args=(
  --model "$MODEL"
  --block-size "$BLOCK_SIZE"
  --enforce-eager
  --enable-lora
  --max-lora-rank 64
)

DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} ZE_AFFINITY_MASK=0 \
  python3 -m dynamo.vllm "${lora_worker_args[@]}" \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &

# The second worker additionally sets VLLM_NIXL_SIDE_CHANNEL_PORT.
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT2} \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20097 ZE_AFFINITY_MASK=1 \
  python3 -m dynamo.vllm "${lora_worker_args[@]}" \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &

# Sample output after running the LoRA inference curl request twice.
# usage.prompt_tokens_details.cached_tokens is the number of tokens that were
# cached from the previous request. The here-document is fed to the no-op ':'
# builtin, so it is documentation only and never printed.
: <<'SAMPLE_OUTPUT'
{
"id": "chatcmpl-0cf880c2-fe98-45c4-9c76-84c3ad1a56cc",
"choices": [
{
"index": 0,
"message": {
"content": "<think>\nOkay, so I need to develop a character background for a character named Elara. Let me start by understanding the requirements. The user wants",
"role": "assistant",
"reasoning_content": null
},
"finish_reason": "length"
}
],
"created": 1765230243,
"model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"object": "chat.completion",
"usage": {
"prompt_tokens": 196,
"completion_tokens": 30,
"total_tokens": 226,
"prompt_tokens_details": {
"audio_tokens": null,
"cached_tokens": 192 # tokens that were cached from the previous request.
}
},
"nvext": {
"worker_id": {
"prefill_worker_id": 7587891281668871552,
"decode_worker_id": 7587891281668871552
}
}
}
SAMPLE_OUTPUT

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on XPU with LoRA enabled: one frontend and one vLLM worker.
#
# Environment overrides: DYN_HTTP_PORT, DYN_SYSTEM_PORT1, MAX_MODEL_LEN,
# MAX_CONCURRENT_SEQS, BLOCK_SIZE.

set -e
# On any exit, signal the whole process group so background jobs are torn down.
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../../common/launch_utils.sh"

# S3-compatible object store settings used for the LoRA artifacts.
# NOTE(review): endpoint/credentials look like a local MinIO default - confirm.
export \
  AWS_ENDPOINT=http://localhost:9000 \
  AWS_ACCESS_KEY_ID=minioadmin \
  AWS_SECRET_ACCESS_KEY=minioadmin \
  AWS_REGION=us-east-1 \
  AWS_ALLOW_HTTP=true

# Dynamo LoRA configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
export VLLM_TARGET_DEVICE=xpu
mkdir -p "$DYN_LORA_PATH"

MODEL="Qwen/Qwen3-0.6B"
# Note: the worker's system port is taken from DYN_SYSTEM_PORT1 (not DYN_SYSTEM_PORT).
SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner --no-curl "Launching Aggregated Serving + LoRA (1 GPU)" "$MODEL" "$HTTP_PORT"

# Usage hints, one output line per argument.
printf '%s\n' \
  "" \
  "Once running, test with:" \
  "" \
  " # Check available models" \
  " curl http://localhost:${HTTP_PORT}/v1/models | jq ." \
  "" \
  " # Load LoRA (using S3 URI)" \
  " curl -s -X POST http://localhost:${SYSTEM_PORT}/v1/loras \\" \
  " -H 'Content-Type: application/json' \\" \
  " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\"," \
  " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ." \
  "" \
  " # Test LoRA inference" \
  " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" \
  " -H 'Content-Type: application/json' \\" \
  " -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\"," \
  " \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}]," \
  " \"max_tokens\": 300, \"temperature\": 0.0}' | jq ." \
  "" \
  " # Test base model inference (for comparison)" \
  " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\" \
  " -H 'Content-Type: application/json' \\" \
  " -d '{\"model\": \"${MODEL}\"," \
  " \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}]," \
  " \"max_tokens\": 300, \"temperature\": 0.0}' | jq ." \
  "" \
  " # Unload LoRA" \
  " curl -X DELETE http://localhost:${SYSTEM_PORT}/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora" \
  "" \
  "=========================================="

# Ingress.
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var.
python -m dynamo.frontend &

# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")

# Pass --gpu-memory-utilization only when the helper produced a non-empty value.
gpu_mem_flag=()
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  gpu_mem_flag=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi

worker_cmd=(
  python -m dynamo.vllm
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
  "${gpu_mem_flag[@]}"
  --enable-lora
  --max-lora-rank 64
)

DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} "${worker_cmd[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on XPU with the LMCache KV connector and an explicitly
# managed PROMETHEUS_MULTIPROC_DIR: one frontend and one vLLM worker.
#
# Environment overrides: PROMETHEUS_MULTIPROC_DIR, DYN_HTTP_PORT,
# DYN_SYSTEM_PORT, MAX_MODEL_LEN, MAX_CONCURRENT_SEQS, BLOCK_SIZE.

set -e

# Explicitly set PROMETHEUS_MULTIPROC_DIR (K8s-style deployment).
# The default embeds the shell PID and $RANDOM so separate runs do not collide.
# Note: the directory is wiped before use and removed again on exit, even when
# the caller supplied its own path.
export PROMETHEUS_MULTIPROC_DIR="${PROMETHEUS_MULTIPROC_DIR:-/tmp/prometheus_multiproc_$$_$RANDOM}"
rm -rf "$PROMETHEUS_MULTIPROC_DIR"
mkdir -p "$PROMETHEUS_MULTIPROC_DIR"

# EXIT handler: delete the metrics directory, then signal the whole process
# group. The removal comes first because 'kill 0' also signals this shell.
cleanup() {
  echo "Cleaning up..."
  rm -rf "$PROMETHEUS_MULTIPROC_DIR"
  kill 0
}
trap cleanup EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

export VLLM_TARGET_DEVICE=xpu
MODEL="Qwen/Qwen3-0.6B"

# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"

python -m dynamo.frontend &

# Pass --gpu-memory-utilization only when the helper produced a non-empty value.
gpu_mem_flag=()
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  gpu_mem_flag=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi

worker_cmd=(
  python -m dynamo.vllm
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
  "${gpu_mem_flag[@]}"
  --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}'
)

# The metrics directory is also passed explicitly in the worker's environment.
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
  PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \
  "${worker_cmd[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on XPU with the LMCache KV connector: one frontend and
# one vLLM worker. PROMETHEUS_MULTIPROC_DIR is deliberately left unset here.
#
# Environment overrides: DYN_HTTP_PORT, DYN_SYSTEM_PORT, MAX_MODEL_LEN,
# MAX_CONCURRENT_SEQS, BLOCK_SIZE.

set -e
# On any exit, signal the whole process group so background jobs are torn down.
trap 'echo Cleaning up...; kill 0' EXIT

# Explicitly unset PROMETHEUS_MULTIPROC_DIR to let LMCache or Dynamo manage it internally
unset PROMETHEUS_MULTIPROC_DIR

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

export VLLM_TARGET_DEVICE=xpu
MODEL="Qwen/Qwen3-0.6B"

# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"

python -m dynamo.frontend &

# Pass --gpu-memory-utilization only when the helper produced a non-empty value.
gpu_mem_flag=()
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  gpu_mem_flag=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi

worker_cmd=(
  python -m dynamo.vllm
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
  "${gpu_mem_flag[@]}"
  --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}'
)

DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} "${worker_cmd[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated multimodal serving with standard Dynamo preprocessing
#
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor handles image URLs (HTTP and data:// base64)
# - Worker: Standard vLLM worker with vision model support
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh
#
# Environment overrides: DYN_HTTP_PORT, DYN_SYSTEM_PORT, MAX_MODEL_LEN,
# MAX_CONCURRENT_SEQS, BLOCK_SIZE, ZE_AFFINITY_MASK.
set -e
# On any exit, signal the whole process group so background jobs are torn down.
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

export VLLM_TARGET_DEVICE=xpu

# Default values
MODEL_NAME="Qwen/Qwen3-VL-8B-Instruct"

# Parse command line arguments
# Extra arguments are passed through to the vLLM worker
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model)
      # Fail with a message instead of dying silently on 'shift 2' under set -e.
      if [[ $# -lt 2 ]]; then
        echo "Error: --model requires a value" >&2
        exit 1
      fi
      MODEL_NAME="$2"
      shift 2
      ;;
    -h|--help)
      echo "Usage: $0 [OPTIONS] [-- EXTRA_VLLM_ARGS]"
      echo "Options:"
      echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
      echo " -h, --help Show this help message"
      echo ""
      echo "Any additional arguments are passed through to the vLLM worker."
      echo "Example: $0 --model Qwen/Qwen3-VL-8B-Instruct --dyn-tool-call-parser hermes"
      exit 0
      ;;
    *)
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done

HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL_NAME" "$HTTP_PORT"

# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it overcomes:
# - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
export DYN_REQUEST_PLANE=tcp

# Start frontend with Rust OpenAIPreprocessor
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

# ---- Per-model defaults ----
# Choose the model-specific default BEFORE applying the env override. The
# previous code defaulted MAX_MODEL_LEN to 4096 first, which made the
# per-model default below unreachable.
DEFAULT_MAX_MODEL_LEN=4096
MODEL_EXTRA_ARGS=()
case "$MODEL_NAME" in
  meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)
    DEFAULT_MAX_MODEL_LEN=108960
    # NOTE(review): tensor parallel size 8 needs more than one visible device;
    # ZE_AFFINITY_MASK defaults to 0 below, so callers presumably must override it.
    MODEL_EXTRA_ARGS=(--tensor-parallel-size=8)
    ;;
esac
MAX_MODEL_LEN="${MAX_MODEL_LEN:-$DEFAULT_MAX_MODEL_LEN}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL_NAME" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")

# Start vLLM worker with vision model
# Extra args from command line come last to allow overrides
# The worker runs in the background so wait_any_exit can watch both the
# frontend and the worker; in the foreground, a frontend crash went unnoticed.
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} \
python -m dynamo.vllm --enable-multimodal --model "$MODEL_NAME" \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-seqs "$MAX_CONCURRENT_SEQS" \
  --block-size "${BLOCK_SIZE:-64}" \
  ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
  "${MODEL_EXTRA_ARGS[@]}" "${EXTRA_ARGS[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on XPU with a selectable request plane (tcp, http or
# nats): one frontend and one vLLM worker.
#
# Usage: see --help. Environment overrides: DYN_HTTP_PORT, DYN_SYSTEM_PORT,
# MAX_MODEL_LEN, MAX_CONCURRENT_SEQS, BLOCK_SIZE.

set -e
# On any exit, signal the whole process group so background jobs are torn down.
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

export VLLM_TARGET_DEVICE=xpu

# Parse command-line arguments for request plane mode (TCP unless told otherwise).
REQUEST_PLANE="tcp"
while (( $# > 0 )); do
  case "$1" in
    --tcp | --http | --nats)
      # The flag name without its leading dashes is the plane name.
      REQUEST_PLANE="${1#--}"
      shift
      ;;
    -h | --help)
      printf '%s\n' \
        "Usage: $0 [--tcp|--http|--nats]" \
        " --tcp Use TCP request plane (default)" \
        " --http Use HTTP/2 request plane" \
        " --nats Use NATS request plane"
      exit 0
      ;;
    *)
      printf '%s\n' \
        "Unknown option: $1" \
        "Use --help for usage information"
      exit 1
      ;;
  esac
done

MODEL="Qwen/Qwen3-0.6B"

# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

# Export the chosen request plane for the processes started below.
export DYN_REQUEST_PLANE="$REQUEST_PLANE"
echo "Using request plane mode: $REQUEST_PLANE"

GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"

python -m dynamo.frontend &

# Pass --gpu-memory-utilization only when the helper produced a non-empty value.
gpu_mem_flag=()
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  gpu_mem_flag=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi

worker_cmd=(
  python -m dynamo.vllm
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
  "${gpu_mem_flag[@]}"
)

DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} DYN_HEALTH_CHECK_ENABLED=true \
  "${worker_cmd[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on XPU with approximate KV routing: a frontend in KV
# router mode with KV events disabled, plus two vLLM workers that each get
# their own ZE_AFFINITY_MASK.
#
# Environment overrides: DYN_HTTP_PORT, DYN_SYSTEM_PORT1, DYN_SYSTEM_PORT2.

set -e
# On any exit, signal the whole process group so background jobs are torn down.
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

export VLLM_TARGET_DEVICE=xpu

# Common configuration
MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner "Launching Aggregated + Approximate KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"

# Frontend with KV router (--router-mode kv) in approximate mode (--no-kv-events)
python -m dynamo.frontend --router-mode kv --no-kv-events &

# Workers.
# NOTE(review): the earlier comment here said --enforce-eager is added for quick
# deployment, but these workers are launched without it - confirm the intent.
#
# If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# Both workers take identical flags; only their environment differs.
approx_worker_args=(
  --model "$MODEL"
  --block-size "$BLOCK_SIZE"
  --kv-events-config '{"enable_kv_cache_events": false}'
)

DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} ZE_AFFINITY_MASK=0 \
  python3 -m dynamo.vllm "${approx_worker_args[@]}" &

# The second worker additionally sets VLLM_NIXL_SIDE_CHANNEL_PORT.
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20097 ZE_AFFINITY_MASK=1 \
  python3 -m dynamo.vllm "${approx_worker_args[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on XPU with KV-aware routing: a frontend in KV router
# mode plus two vLLM workers that publish KV cache events over ZMQ.
#
# Environment overrides: DYN_HTTP_PORT, DYN_SYSTEM_PORT1, DYN_SYSTEM_PORT2.
#
# TODO: rename to agg_router_2gpu.sh (uses 2 GPUs) and update all references

set -e
# On any exit, signal the whole process group so background jobs are torn down.
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

# Deterministic hashing for KV event IDs
export PYTHONHASHSEED=0
export VLLM_TARGET_DEVICE=xpu

# Common configuration
MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"

# Frontend + KV router.
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend --router-mode kv --router-reset-states &

# Workers.
# --enforce-eager is used for quick deployment; remove it for production use.
#
# If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# Flags shared by both workers; only the KV-events endpoint differs.
router_worker_args=(
  --model "$MODEL"
  --block-size "$BLOCK_SIZE"
  --enforce-eager
)

DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} ZE_AFFINITY_MASK=0 \
  python3 -m dynamo.vllm "${router_worker_args[@]}" \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &

# The second worker additionally sets VLLM_NIXL_SIDE_CHANNEL_PORT.
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20097 ZE_AFFINITY_MASK=1 \
  python3 -m dynamo.vllm "${router_worker_args[@]}" \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on a single GPU (XPU target): one frontend, one vLLM worker.
#
# Arguments: --model <name> selects the model; every other argument is
# forwarded unchanged to the vLLM worker.
# Environment overrides: DYN_HTTP_PORT, DYN_SYSTEM_PORT, MAX_MODEL_LEN,
# MAX_CONCURRENT_SEQS, BLOCK_SIZE.

set -e
# On any exit, signal the whole process group so background jobs are torn down.
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Presumably provides build_gpu_mem_args, which is called below but not defined here.
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
# print_launch_banner, wait_any_exit
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

export VLLM_TARGET_DEVICE=xpu

# Default model
MODEL="Qwen/Qwen3-0.6B"

# Parse command line arguments; unrecognised ones are collected for the worker.
EXTRA_ARGS=()
while (( $# > 0 )); do
  case "$1" in
    --model)
      MODEL="$2"
      shift 2
      ;;
    *)
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done

# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"

GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"

# Ingress.
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

# Pass --gpu-memory-utilization only when the helper produced a non-empty value.
gpu_mem_flag=()
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  gpu_mem_flag=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi

# Worker.
# --enforce-eager is used for quick deployment; remove it for production use.
# Caller-supplied extra arguments go last.
worker_cmd=(
  python -m dynamo.vllm
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
  "${gpu_mem_flag[@]}"
  "${EXTRA_ARGS[@]}"
)

DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} "${worker_cmd[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment