Unverified Commit dd7ceb4a authored by Yi Yao's avatar Yi Yao Committed by GitHub
Browse files

feat: add examples and unit tests for vLLM aggregated serving on XPU (#7146)


Signed-off-by: default avatarYi Yao <yi.a.yao@intel.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
parent c09ac697
......@@ -40,27 +40,36 @@ Each launch script runs the frontend and worker(s) in a single terminal. You can
The simplest deployment pattern: a single worker handles both prefill and decode. Requires 1 GPU.
Run on CUDA devices:
```bash
cd $DYNAMO_HOME/examples/backends/vllm
bash launch/agg.sh
```
For XPU deployments, use a larger block size and set it to at least `64` (`>= 64`):
Run on XPUs:
```bash
# XeTLA ChunkPrefill FP8KV: only support block_size >= 64
cd $DYNAMO_HOME/examples/backends/vllm
bash launch/agg.sh --block-size 64
bash launch/xpu/agg_xpu.sh
```
### Aggregated Serving with KV Routing
Two workers behind a [KV-aware router](../../components/router/README.md) that maximizes cache reuse. Requires 2 GPUs.
Run on CUDA devices:
```bash
cd $DYNAMO_HOME/examples/backends/vllm
bash launch/agg_router.sh
```
Run on XPUs:
```bash
cd $DYNAMO_HOME/examples/backends/vllm
bash launch/xpu/agg_router_xpu.sh
```
This launches the frontend in KV routing mode with two workers publishing KV events over ZMQ.
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../../common/launch_utils.sh"
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
export AWS_ALLOW_HTTP=true
# Dynamo LoRA Configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
mkdir -p $DYN_LORA_PATH
# Set deterministic hash for KV event IDs
export PYTHONHASHSEED=0
export VLLM_TARGET_DEVICE=xpu
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
SYSTEM_PORT1="${DYN_SYSTEM_PORT1:-8081}"
SYSTEM_PORT2="${DYN_SYSTEM_PORT2:-8082}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --no-curl "Launching Aggregated + LoRA + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
echo ""
echo "Once running, test with:"
echo ""
echo " # Check available models"
echo " curl http://localhost:${HTTP_PORT}/v1/models | jq ."
echo ""
echo " # Load LoRA to both instances (using S3 URI)"
echo " curl -s -X POST http://localhost:${SYSTEM_PORT1}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
echo ""
echo " curl -s -X POST http://localhost:${SYSTEM_PORT2}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
echo ""
echo " # Test LoRA inference"
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
echo " \"max_tokens\": 32}' | jq ."
echo ""
echo "=========================================="
# run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \
--router-mode kv \
--router-reset-states &
# run workers
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--enable-lora \
--max-lora-rank 64 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT2} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
ZE_AFFINITY_MASK=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--enable-lora \
--max-lora-rank 64 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Sample output after running LoRA inference curl request twice.
# usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request.
: <<'SAMPLE_OUTPUT'
{
"id": "chatcmpl-0cf880c2-fe98-45c4-9c76-84c3ad1a56cc",
"choices": [
{
"index": 0,
"message": {
"content": "<think>\nOkay, so I need to develop a character background for a character named Elara. Let me start by understanding the requirements. The user wants",
"role": "assistant",
"reasoning_content": null
},
"finish_reason": "length"
}
],
"created": 1765230243,
"model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"object": "chat.completion",
"usage": {
"prompt_tokens": 196,
"completion_tokens": 30,
"total_tokens": 226,
"prompt_tokens_details": {
"audio_tokens": null,
"cached_tokens": 192 # tokens that were cached from the previous request.
}
},
"nvext": {
"worker_id": {
"prefill_worker_id": 7587891281668871552,
"decode_worker_id": 7587891281668871552
}
}
}
SAMPLE_OUTPUT
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../../common/launch_utils.sh"
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
export AWS_ALLOW_HTTP=true
# Dynamo LoRA Configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
export VLLM_TARGET_DEVICE=xpu
mkdir -p $DYN_LORA_PATH
MODEL="Qwen/Qwen3-0.6B"
SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --no-curl "Launching Aggregated Serving + LoRA (1 GPU)" "$MODEL" "$HTTP_PORT"
echo ""
echo "Once running, test with:"
echo ""
echo " # Check available models"
echo " curl http://localhost:${HTTP_PORT}/v1/models | jq ."
echo ""
echo " # Load LoRA (using S3 URI)"
echo " curl -s -X POST http://localhost:${SYSTEM_PORT}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
echo ""
echo " # Test LoRA inference"
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
echo " \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
echo ""
echo " # Test base model inference (for comparison)"
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
echo " \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
echo ""
echo " # Unload LoRA"
echo " curl -X DELETE http://localhost:${SYSTEM_PORT}/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
echo ""
echo "=========================================="
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var.
python -m dynamo.frontend &
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
--enable-lora \
--max-lora-rank 64 &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
# Explicitly set PROMETHEUS_MULTIPROC_DIR (K8s-style deployment)
# Use unique directory per test run to avoid conflicts
export PROMETHEUS_MULTIPROC_DIR=${PROMETHEUS_MULTIPROC_DIR:-/tmp/prometheus_multiproc_$$_$RANDOM}
rm -rf "$PROMETHEUS_MULTIPROC_DIR"
mkdir -p "$PROMETHEUS_MULTIPROC_DIR"
# Cleanup function to remove the directory on exit
cleanup() {
echo "Cleaning up..."
rm -rf "$PROMETHEUS_MULTIPROC_DIR"
kill 0
}
trap cleanup EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
export VLLM_TARGET_DEVICE=xpu
MODEL="Qwen/Qwen3-0.6B"
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"
python -m dynamo.frontend &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Explicitly unset PROMETHEUS_MULTIPROC_DIR to let LMCache or Dynamo manage it internally
unset PROMETHEUS_MULTIPROC_DIR
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
export VLLM_TARGET_DEVICE=xpu
MODEL="Qwen/Qwen3-0.6B"
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"
python -m dynamo.frontend &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated multimodal serving with standard Dynamo preprocessing
#
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor handles image URLs (HTTP and data:// base64)
# - Worker: Standard vLLM worker with vision model support
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
export VLLM_TARGET_DEVICE=xpu
# Default values
MODEL_NAME="Qwen/Qwen3-VL-8B-Instruct"
# Parse command line arguments
# Extra arguments are passed through to the vLLM worker
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL_NAME=$2
shift 2
;;
-h|--help)
echo "Usage: $0 [OPTIONS] [-- EXTRA_VLLM_ARGS]"
echo "Options:"
echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
echo " -h, --help Show this help message"
echo ""
echo "Any additional arguments are passed through to the vLLM worker."
echo "Example: $0 --model Qwen/Qwen3-VL-8B-Instruct --dyn-tool-call-parser hermes"
exit 0
;;
*)
EXTRA_ARGS+=("$1")
shift
;;
esac
done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL_NAME" "$HTTP_PORT"
# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it overcomes:
# - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
export DYN_REQUEST_PLANE=tcp
# Start frontend with Rust OpenAIPreprocessor
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# ---- Per-model defaults ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
MODEL_EXTRA_ARGS=""
case "$MODEL_NAME" in
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)
MAX_MODEL_LEN="${MAX_MODEL_LEN:-108960}"
MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
esac
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL_NAME" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
# Start vLLM worker with vision model
# --enforce-eager: Quick deployment (remove for production)
# Extra args from command line come last to allow overrides
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}"
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
export VLLM_TARGET_DEVICE=xpu
# Parse command-line arguments for request plane mode
REQUEST_PLANE="tcp" # Default to TCP
while [[ $# -gt 0 ]]; do
case $1 in
--tcp)
REQUEST_PLANE="tcp"
shift
;;
--http)
REQUEST_PLANE="http"
shift
;;
--nats)
REQUEST_PLANE="nats"
shift
;;
-h|--help)
echo "Usage: $0 [--tcp|--http|--nats]"
echo " --tcp Use TCP request plane (default)"
echo " --http Use HTTP/2 request plane"
echo " --nats Use NATS request plane"
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done
MODEL="Qwen/Qwen3-0.6B"
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# Set the request plane mode
export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"
python -m dynamo.frontend &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
DYN_HEALTH_CHECK_ENABLED=true \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
export VLLM_TARGET_DEVICE=xpu
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + Approximate KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run frontend with KV router (--router-mode kv) in approximate mode (--no-kv-events)
python -m dynamo.frontend \
--router-mode kv \
--no-kv-events &
# run workers
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
#
# If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--kv-events-config '{"enable_kv_cache_events": false}' &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
ZE_AFFINITY_MASK=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--kv-events-config '{"enable_kv_cache_events": false}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# TODO: rename to agg_router_2gpu.sh (uses 2 GPUs) and update all references
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
# Set deterministic hash for KV event IDs
export PYTHONHASHSEED=0
export VLLM_TARGET_DEVICE=xpu
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
BLOCK_SIZE=64
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
# run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \
--router-mode kv \
--router-reset-states &
# run workers
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
#
# If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
#
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
ZE_AFFINITY_MASK=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on a single GPU.
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh" # gpu_gb_to_total_fraction
source "$SCRIPT_DIR/../../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
export VLLM_TARGET_DEVICE=xpu
# Default model
MODEL="Qwen/Qwen3-0.6B"
# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL="$2"
shift 2
;;
*)
EXTRA_ARGS+=("$1")
shift
;;
esac
done
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# run worker
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \
${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment