"deploy/observability/k8s/vscode:/vscode.git/clone" did not exist on "39d645e58647d6adb074650e46be5de25f3f3bc6"
Unverified Commit dd7ceb4a authored by Yi Yao's avatar Yi Yao Committed by GitHub
Browse files

feat: add examples and unit tests for vLLM aggregated serving on XPU (#7146)


Signed-off-by: default avatarYi Yao <yi.a.yao@intel.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
parent c09ac697
......@@ -40,27 +40,36 @@ Each launch script runs the frontend and worker(s) in a single terminal. You can
The simplest deployment pattern: a single worker handles both prefill and decode. Requires 1 GPU.
Run on CUDA devices:
```bash
cd $DYNAMO_HOME/examples/backends/vllm
bash launch/agg.sh
```
For XPU deployments, set the block size to at least `64`:
Run on XPUs:
```bash
# XeTLA ChunkPrefill FP8KV supports only block_size >= 64
cd $DYNAMO_HOME/examples/backends/vllm
bash launch/agg.sh --block-size 64
bash launch/xpu/agg_xpu.sh
```
### Aggregated Serving with KV Routing
Two workers behind a [KV-aware router](../../components/router/README.md) that maximizes cache reuse. Requires 2 GPUs.
Run on CUDA devices:
```bash
cd $DYNAMO_HOME/examples/backends/vllm
bash launch/agg_router.sh
```
Run on XPUs:
```bash
cd $DYNAMO_HOME/examples/backends/vllm
bash launch/xpu/agg_router_xpu.sh
```
This launches the frontend in KV routing mode with two workers publishing KV events over ZMQ.
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so the
# background frontend and workers started below do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT
# Directory of this script with symlinks resolved; used to locate shared helpers.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Provides print_launch_banner and wait_any_exit, both called later in this script.
source "$SCRIPT_DIR/../../../../../common/launch_utils.sh"
# S3-style endpoint and credentials exported for the processes launched below.
# NOTE(review): localhost:9000 with minioadmin/minioadmin looks like a local
# MinIO development instance — confirm before reusing outside a dev setup.
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
export AWS_ALLOW_HTTP=true
# Dynamo LoRA Configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
# Make sure the LoRA directory exists before any worker starts.
mkdir -p $DYN_LORA_PATH
# Set deterministic hash for KV event IDs
export PYTHONHASHSEED=0
# Tell vLLM to target the XPU device backend.
export VLLM_TARGET_DEVICE=xpu
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
# Fixed at 64 in this script (not overridable via the environment).
BLOCK_SIZE=64
# Each worker gets its own system port; both can be overridden via the
# numbered DYN_SYSTEM_PORT{1,2} variables.
SYSTEM_PORT1="${DYN_SYSTEM_PORT1:-8081}"
SYSTEM_PORT2="${DYN_SYSTEM_PORT2:-8082}"
# HTTP_PORT is only used for the banner and usage text printed by this script.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
# Print the launch banner, then copy-pasteable curl examples: list models,
# load the LoRA on each worker's system port, and run one LoRA inference
# through the frontend. Nothing below is executed; it is only echoed.
print_launch_banner --no-curl "Launching Aggregated + LoRA + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
echo ""
echo "Once running, test with:"
echo ""
echo " # Check available models"
echo " curl http://localhost:${HTTP_PORT}/v1/models | jq ."
echo ""
echo " # Load LoRA to both instances (using S3 URI)"
# The LoRA must be loaded once per worker, hence one request per system port.
echo " curl -s -X POST http://localhost:${SYSTEM_PORT1}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
echo ""
echo " curl -s -X POST http://localhost:${SYSTEM_PORT2}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
echo ""
echo " # Test LoRA inference"
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
echo " \"max_tokens\": 32}' | jq ."
echo ""
echo "=========================================="
# Frontend with the KV-aware router. No --http-port flag is passed, so the
# port comes from the DYN_HTTP_PORT env var (dynamo.frontend defaults to 8000).
python -m dynamo.frontend --router-mode kv --router-reset-states &

# Flags common to both LoRA workers.
# --enforce-eager keeps startup quick; remove it for production deployments.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
lora_worker_flags=(
  --model "$MODEL"
  --block-size "$BLOCK_SIZE"
  --enforce-eager
  --enable-lora
  --max-lora-rank 64
)

# Worker 1: ZE_AFFINITY_MASK=0, system port 1, KV events published on tcp://*:20080.
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT="${SYSTEM_PORT1}" ZE_AFFINITY_MASK=0 \
  python3 -m dynamo.vllm "${lora_worker_flags[@]}" \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &

# Worker 2: ZE_AFFINITY_MASK=1, system port 2, KV events on tcp://*:20081, and
# an explicit NIXL side-channel port.
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT="${SYSTEM_PORT2}" \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20097 ZE_AFFINITY_MASK=1 \
  python3 -m dynamo.vllm "${lora_worker_flags[@]}" \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
# Sample output after running LoRA inference curl request twice.
# usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request.
# The ':' builtin with a quoted here-document is a no-op: the text below is
# never executed or expanded; it is kept in the script purely as reference.
: <<'SAMPLE_OUTPUT'
{
"id": "chatcmpl-0cf880c2-fe98-45c4-9c76-84c3ad1a56cc",
"choices": [
{
"index": 0,
"message": {
"content": "<think>\nOkay, so I need to develop a character background for a character named Elara. Let me start by understanding the requirements. The user wants",
"role": "assistant",
"reasoning_content": null
},
"finish_reason": "length"
}
],
"created": 1765230243,
"model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"object": "chat.completion",
"usage": {
"prompt_tokens": 196,
"completion_tokens": 30,
"total_tokens": 226,
"prompt_tokens_details": {
"audio_tokens": null,
"cached_tokens": 192 # tokens that were cached from the previous request.
}
},
"nvext": {
"worker_id": {
"prefill_worker_id": 7587891281668871552,
"decode_worker_id": 7587891281668871552
}
}
}
SAMPLE_OUTPUT
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
# (wait_any_exit comes from the sourced launch_utils.sh.)
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so the
# background frontend and worker do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT
# Directory of this script with symlinks resolved; used to locate shared helpers.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Shared helpers; this script later calls build_gpu_mem_args,
# print_launch_banner and wait_any_exit, none of which are defined here.
source "$SCRIPT_DIR/../../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../../common/launch_utils.sh"
# S3-style endpoint and credentials exported for the processes launched below.
# NOTE(review): localhost:9000 with minioadmin/minioadmin looks like a local
# MinIO development instance — confirm before reusing outside a dev setup.
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
export AWS_ALLOW_HTTP=true
# Dynamo LoRA Configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
# Tell vLLM to target the XPU device backend.
export VLLM_TARGET_DEVICE=xpu
# Make sure the LoRA directory exists before the worker starts.
mkdir -p $DYN_LORA_PATH
MODEL="Qwen/Qwen3-0.6B"
# NOTE(review): this single-worker script reads DYN_SYSTEM_PORT1 (numbered),
# not DYN_SYSTEM_PORT — confirm that is what the test harness exports.
SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
# HTTP_PORT is only used for the banner and usage text printed by this script.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
# Print the launch banner, then copy-pasteable curl examples covering the
# whole LoRA lifecycle: list models, load the adapter, compare LoRA vs. base
# model output, and unload. Nothing below is executed; it is only echoed.
print_launch_banner --no-curl "Launching Aggregated Serving + LoRA (1 GPU)" "$MODEL" "$HTTP_PORT"
echo ""
echo "Once running, test with:"
echo ""
echo " # Check available models"
echo " curl http://localhost:${HTTP_PORT}/v1/models | jq ."
echo ""
echo " # Load LoRA (using S3 URI)"
# LoRA management goes to the worker's system port, not the frontend port.
echo " curl -s -X POST http://localhost:${SYSTEM_PORT}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
echo ""
echo " # Test LoRA inference"
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
echo " \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
echo ""
echo " # Test base model inference (for comparison)"
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"${MODEL}\","
echo " \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
echo " \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
echo ""
echo " # Unload LoRA"
echo " curl -X DELETE http://localhost:${SYSTEM_PORT}/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
echo ""
echo "=========================================="
# Ingress. No --http-port flag is passed, so dynamo.frontend takes its port
# from the DYN_HTTP_PORT env var.
python -m dynamo.frontend &

# ---- Tunables (override via env vars) ----
: "${MAX_MODEL_LEN:=4096}"
: "${MAX_CONCURRENT_SEQS:=2}"
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")

# Assemble the worker command line; the memory flag is only added when
# build_gpu_mem_args produced a non-empty value.
lora_worker_flags=(
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
)
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  lora_worker_flags+=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi
lora_worker_flags+=(--enable-lora --max-lora-rank 64)

DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT="${SYSTEM_PORT}" \
  python -m dynamo.vllm "${lora_worker_flags[@]}" &

# Block until the first background process exits; the EXIT trap (kill 0)
# then tears down whatever is still running.
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Abort on the first failing foreground command.
set -e
# Explicitly set PROMETHEUS_MULTIPROC_DIR (K8s-style deployment)
# Use unique directory per test run to avoid conflicts
# A caller-provided PROMETHEUS_MULTIPROC_DIR wins; otherwise a path built
# from this shell's PID and $RANDOM is used.
export PROMETHEUS_MULTIPROC_DIR=${PROMETHEUS_MULTIPROC_DIR:-/tmp/prometheus_multiproc_$$_$RANDOM}
# Start from an empty directory.
# NOTE(review): this also wipes a caller-supplied directory — make sure
# callers only ever pass a scratch path.
rm -rf "$PROMETHEUS_MULTIPROC_DIR"
mkdir -p "$PROMETHEUS_MULTIPROC_DIR"
# Cleanup function to remove the directory on exit
# The directory is removed before 'kill 0' because 'kill 0' signals the whole
# process group, which includes this shell.
cleanup() {
echo "Cleaning up..."
rm -rf "$PROMETHEUS_MULTIPROC_DIR"
kill 0
}
trap cleanup EXIT
# Directory of this script with symlinks resolved; used to locate shared helpers.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Shared helpers; this script calls build_gpu_mem_args, print_launch_banner
# and wait_any_exit, none of which are defined here.
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
# Tell vLLM to target the XPU device backend.
export VLLM_TARGET_DEVICE=xpu
MODEL="Qwen/Qwen3-0.6B"
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# May be empty; the worker launch only passes --gpu-memory-utilization when
# this value is non-empty.
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
# HTTP_PORT is only used for the banner printed here.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"
# Ingress.
python -m dynamo.frontend &

# Assemble the worker command line; the memory flag is only added when
# GPU_MEM_FRACTION is non-empty.
lmcache_worker_flags=(
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
)
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  lmcache_worker_flags+=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi
# LMCache connector acting as both KV producer and consumer, buffering on the XPU.
lmcache_worker_flags+=(
  --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}'
)

# PROMETHEUS_MULTIPROC_DIR is passed explicitly to the worker in addition to
# being exported earlier in the script.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT:-8081}" \
  PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \
  python -m dynamo.vllm "${lmcache_worker_flags[@]}" &

# Block until the first background process exits; the EXIT trap then tears
# down whatever is still running.
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so the
# background frontend and worker do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT
# Explicitly unset PROMETHEUS_MULTIPROC_DIR to let LMCache or Dynamo manage it internally
unset PROMETHEUS_MULTIPROC_DIR
# Directory of this script with symlinks resolved; used to locate shared helpers.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Shared helpers; this script calls build_gpu_mem_args, print_launch_banner
# and wait_any_exit, none of which are defined here.
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
# Tell vLLM to target the XPU device backend.
export VLLM_TARGET_DEVICE=xpu
MODEL="Qwen/Qwen3-0.6B"
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# May be empty; the worker launch only passes --gpu-memory-utilization when
# this value is non-empty.
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
# HTTP_PORT is only used for the banner printed here.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"
# Ingress.
python -m dynamo.frontend &

# Assemble the worker command line; the memory flag is only added when
# GPU_MEM_FRACTION is non-empty.
lmcache_worker_flags=(
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
)
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  lmcache_worker_flags+=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi
# LMCache connector acting as both KV producer and consumer, buffering on the XPU.
lmcache_worker_flags+=(
  --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}'
)

DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT:-8081}" \
  python -m dynamo.vllm "${lmcache_worker_flags[@]}" &

# Block until the first background process exits; the EXIT trap (kill 0)
# then tears down whatever is still running.
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated multimodal serving with standard Dynamo preprocessing
#
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor handles image URLs (HTTP and data:// base64)
# - Worker: Standard vLLM worker with vision model support
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh
# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so
# background processes do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT
# Directory containing this script (via BASH_SOURCE; symlinks are not resolved here).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Shared helpers; this script calls build_gpu_mem_args, print_launch_banner
# and wait_any_exit, none of which are defined here.
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
# Tell vLLM to target the XPU device backend.
export VLLM_TARGET_DEVICE=xpu
# Default values
MODEL_NAME="Qwen/Qwen3-VL-8B-Instruct"

# Parse command line arguments.
#   --model <name>  overrides MODEL_NAME
#   -h | --help     prints usage and exits 0
#   anything else   is collected, in order, into EXTRA_ARGS and later passed
#                   through to the vLLM worker
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model)
      # Fail loudly when the value is missing instead of letting 'shift 2'
      # fail (a silent exit under 'set -e', an endless loop without it).
      if [[ $# -lt 2 ]]; then
        echo "Error: --model requires a value" >&2
        echo "Use --help for usage information" >&2
        exit 1
      fi
      MODEL_NAME="$2"
      shift 2
      ;;
    -h|--help)
      echo "Usage: $0 [OPTIONS] [-- EXTRA_VLLM_ARGS]"
      echo "Options:"
      echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
      echo " -h, --help Show this help message"
      echo ""
      echo "Any additional arguments are passed through to the vLLM worker."
      echo "Example: $0 --model Qwen/Qwen3-VL-8B-Instruct --dyn-tool-call-parser hermes"
      exit 0
      ;;
    *)
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done
# HTTP_PORT is only used for the banner printed here.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL_NAME" "$HTTP_PORT"
# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it overcomes:
# - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
# Exported so both the frontend and the worker started below inherit it.
export DYN_REQUEST_PLANE=tcp
# Start frontend with Rust OpenAIPreprocessor
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
# ---- Per-model defaults ----
# Pick the model-specific default context length first and only then apply
# the MAX_MODEL_LEN environment override. Defaulting MAX_MODEL_LEN before the
# case statement would make any per-model default unreachable, because the
# variable would already be non-empty.
DEFAULT_MAX_MODEL_LEN=4096
# Extra worker flags implied by the model; deliberately a plain string that
# is word-split where it is used.
MODEL_EXTRA_ARGS=""
case "$MODEL_NAME" in
  meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)
    DEFAULT_MAX_MODEL_LEN=108960
    MODEL_EXTRA_ARGS="--tensor-parallel-size=8"
    ;;
esac
# An explicit MAX_MODEL_LEN / MAX_CONCURRENT_SEQS from the environment wins.
MAX_MODEL_LEN="${MAX_MODEL_LEN:-$DEFAULT_MAX_MODEL_LEN}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# May be empty; --gpu-memory-utilization is only passed when it is non-empty.
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL_NAME" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
# Start vLLM worker with vision model
# --enforce-eager: Quick deployment (remove for production)
# Extra args from command line come last to allow overrides
#
# The worker is started in the background so that wait_any_exit below
# supervises the frontend and the worker together; run in the foreground,
# a frontend crash would go unnoticed for as long as the worker stays alive.
# MODEL_EXTRA_ARGS is intentionally unquoted: it is a space-separated flag string.
# shellcheck disable=SC2086
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} python -m dynamo.vllm --enable-multimodal --model "$MODEL_NAME" \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-seqs "$MAX_CONCURRENT_SEQS" \
  --block-size "${BLOCK_SIZE:-64}" \
  ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so the
# background frontend and worker do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT
# Directory of this script with symlinks resolved; used to locate shared helpers.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Shared helpers; this script calls build_gpu_mem_args, print_launch_banner
# and wait_any_exit, none of which are defined here.
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
# Tell vLLM to target the XPU device backend.
export VLLM_TARGET_DEVICE=xpu
# Select the request plane from the command line. The last of
# --tcp/--http/--nats wins; with no flag the plane stays "tcp".
# --help prints usage and exits 0; any other argument is rejected with exit 1.
REQUEST_PLANE="tcp" # Default to TCP
while (( $# > 0 )); do
  case "$1" in
    --tcp|--http|--nats)
      # The plane name is the flag without its leading dashes.
      REQUEST_PLANE="${1#--}"
      ;;
    -h|--help)
      echo "Usage: $0 [--tcp|--http|--nats]"
      echo " --tcp Use TCP request plane (default)"
      echo " --http Use HTTP/2 request plane"
      echo " --nats Use NATS request plane"
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      echo "Use --help for usage information"
      exit 1
      ;;
  esac
  shift
done
MODEL="Qwen/Qwen3-0.6B"
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# Set the request plane mode
# Exported so both the frontend and the worker started below inherit it.
export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE"
# May be empty; the worker launch only passes --gpu-memory-utilization when
# this value is non-empty.
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
# HTTP_PORT is only used for the banner printed here.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"
# Ingress.
python -m dynamo.frontend &

# Assemble the worker command line; the memory flag is only added when
# GPU_MEM_FRACTION is non-empty.
plane_worker_flags=(
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
)
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  plane_worker_flags+=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi

# The worker runs with health checks enabled.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT:-8081}" DYN_HEALTH_CHECK_ENABLED=true \
  python -m dynamo.vllm "${plane_worker_flags[@]}" &

# Block until the first background process exits; the EXIT trap (kill 0)
# then tears down whatever is still running.
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so the
# background frontend and workers do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT
# Directory of this script with symlinks resolved; used to locate shared helpers.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Provides print_launch_banner and wait_any_exit, both called in this script.
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
# Tell vLLM to target the XPU device backend.
export VLLM_TARGET_DEVICE=xpu
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
# Fixed at 64 in this script (not overridable via the environment).
BLOCK_SIZE=64
# HTTP_PORT is only used for the banner printed here.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + Approximate KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
# Frontend with the KV router (--router-mode kv) in approximate mode
# (--no-kv-events): routing does not consume KV events from the workers.
python -m dynamo.frontend --router-mode kv --no-kv-events &

# Flags shared by both workers; KV cache events are switched off to match the
# router's approximate mode.
# NOTE(review): the previous comment here said --enforce-eager is added for
# quick deployment, but neither worker passes it — confirm which is intended.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
approx_worker_flags=(
  --model "$MODEL"
  --block-size "$BLOCK_SIZE"
  --kv-events-config '{"enable_kv_cache_events": false}'
)

# Workers must not share a system/metrics port, so each one reads its own
# numbered DYN_SYSTEM_PORT{1,2} variable (handy for tests/launchers).

# Worker 1: ZE_AFFINITY_MASK=0.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}" ZE_AFFINITY_MASK=0 \
  python3 -m dynamo.vllm "${approx_worker_flags[@]}" &

# Worker 2: ZE_AFFINITY_MASK=1, with an explicit NIXL side-channel port.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT2:-8082}" \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20097 ZE_AFFINITY_MASK=1 \
  python3 -m dynamo.vllm "${approx_worker_flags[@]}" &

# Block until the first background process exits; the EXIT trap (kill 0)
# then tears down whatever is still running.
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# TODO: rename to agg_router_2gpu.sh (uses 2 GPUs) and update all references
# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so the
# background frontend and workers do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT
# Directory of this script with symlinks resolved; used to locate shared helpers.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Provides print_launch_banner and wait_any_exit, both called in this script.
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
# Set deterministic hash for KV event IDs
export PYTHONHASHSEED=0
# Tell vLLM to target the XPU device backend.
export VLLM_TARGET_DEVICE=xpu
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
# Fixed at 64 in this script (not overridable via the environment).
BLOCK_SIZE=64
# HTTP_PORT is only used for the banner printed here.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
# Frontend with the KV-aware router. No --http-port flag is passed, so the
# port comes from the DYN_HTTP_PORT env var (dynamo.frontend defaults to 8000).
python -m dynamo.frontend --router-mode kv --router-reset-states &

# Flags shared by both workers.
# --enforce-eager keeps startup quick; remove it for production deployments.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
router_worker_flags=(
  --model "$MODEL"
  --block-size "$BLOCK_SIZE"
  --enforce-eager
)

# Workers must not share a system/metrics port, so each one reads its own
# numbered DYN_SYSTEM_PORT{1,2} variable (handy for tests/launchers).

# Worker 1: ZE_AFFINITY_MASK=0, KV events published on tcp://*:20080.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}" ZE_AFFINITY_MASK=0 \
  python3 -m dynamo.vllm "${router_worker_flags[@]}" \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &

# Worker 2: ZE_AFFINITY_MASK=1, KV events on tcp://*:20081, and an explicit
# NIXL side-channel port.
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT2:-8082}" \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20097 ZE_AFFINITY_MASK=1 \
  python3 -m dynamo.vllm "${router_worker_flags[@]}" \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &

# Block until the first background process exits; the EXIT trap (kill 0)
# then tears down whatever is still running.
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving on a single GPU.
# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so the
# background frontend and worker do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT
# Directory of this script with symlinks resolved; used to locate shared helpers.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# NOTE(review): this script calls build_gpu_mem_args rather than
# gpu_gb_to_total_fraction — presumably also provided by gpu_utils.sh; confirm.
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh" # gpu_gb_to_total_fraction
source "$SCRIPT_DIR/../../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Tell vLLM to target the XPU device backend.
export VLLM_TARGET_DEVICE=xpu
# Default model
MODEL="Qwen/Qwen3-0.6B"

# Parse command line arguments.
#   --model <name>  overrides MODEL
#   anything else   is collected, in order, into EXTRA_ARGS and later appended
#                   to the vLLM worker command line
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model)
      # Fail loudly when the value is missing instead of letting 'shift 2'
      # fail (a silent exit under 'set -e', an endless loop without it).
      if [[ $# -lt 2 ]]; then
        echo "Error: --model requires a value" >&2
        exit 1
      fi
      MODEL="$2"
      shift 2
      ;;
    *)
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# May be empty; the worker launch only passes --gpu-memory-utilization when
# this value is non-empty.
GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
# HTTP_PORT is only used for the banner printed here.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
# Ingress. No --http-port flag is passed, so the port comes from the
# DYN_HTTP_PORT env var (dynamo.frontend defaults to 8000).
python -m dynamo.frontend &

# Worker command line.
# --enforce-eager keeps startup quick; remove it for production deployments.
# The memory flag is only added when GPU_MEM_FRACTION is non-empty, and the
# pass-through EXTRA_ARGS go last.
agg_worker_flags=(
  --model "$MODEL"
  --enforce-eager
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_CONCURRENT_SEQS"
  --block-size "${BLOCK_SIZE:-64}"
)
if [[ -n "$GPU_MEM_FRACTION" ]]; then
  agg_worker_flags+=(--gpu-memory-utilization "$GPU_MEM_FRACTION")
fi

DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT:-8081}" \
  python -m dynamo.vllm "${agg_worker_flags[@]}" "${EXTRA_ARGS[@]}" &

# Block until the first background process exits; the EXIT trap (kill 0)
# then tears down whatever is still running.
wait_any_exit
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import base64
import dataclasses
import logging
import os
import random
from dataclasses import dataclass, field
from typing import Optional
import pytest
from tests.serve.common import (
WORKSPACE_DIR,
params_with_model_mark,
run_serve_deployment,
)
from tests.serve.conftest import MULTIMODAL_IMG_URL, get_multimodal_test_image_bytes
from tests.serve.lora_utils import MinioLoraConfig
from tests.utils.constants import DefaultPort
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
cached_tokens_chat_payload,
chat_payload,
chat_payload_default,
chat_payload_with_logprobs,
completion_payload_default,
completion_payload_with_logprobs,
metric_payload_default,
)
from tests.utils.payloads import LoraTestChatPayload, ToolCallingChatPayload
logger = logging.getLogger(__name__)
@dataclass
class VLLMConfig(EngineConfig):
    """EngineConfig specialisation used by the vLLM serve test scenarios."""

    # Every instance gets its own fresh ["VLLM:EngineCore"] list.
    # NOTE(review): presumably the names of leftover processes to clean up
    # after a run — confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
# Directory holding the vLLM example launch scripts. A non-empty VLLM_DIR
# environment variable takes precedence over the in-repo default.
_default_vllm_dir = os.path.join(WORKSPACE_DIR, "examples/backends/vllm")
vllm_dir = os.environ.get("VLLM_DIR") or _default_vllm_dir
# vLLM test configurations
# Maps a scenario key to the VLLMConfig describing which launch script to run
# (script_name under vllm_dir), which pytest marks apply, and which request
# payloads are sent to the deployment.
vllm_configs = {
# Baseline aggregated XPU deployment: default chat and completion payloads, a
# chat request that must stop on (and include) the word "song", and a vLLM
# metrics check.
"aggregated": VLLMConfig(
name="aggregated_xpu",
directory=vllm_dir,
script_name="xpu/agg_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(300), # 3x measured time (43s) + download time (150s)
],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(),
completion_payload_default(),
chat_payload(
"Can you write me a song?",
repeat_count=1,
expected_response=["song"],
temperature=0.0,
max_tokens=32,
extra_body={
"stop": ["song"],
"include_stop_str_in_output": True,
},
),
metric_payload_default(min_num_requests=6, backend="vllm"),
],
),
# Same launch script as "aggregated", exercising log-probability output on
# both the chat (top_logprobs=3) and completion (logprobs=5) endpoints.
"aggregated_logprobs": VLLMConfig(
name="aggregated_logprobs_xpu",
directory=vllm_dir,
script_name="xpu/agg_xpu.sh",
marks=[pytest.mark.xpu_1, pytest.mark.post_merge],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_with_logprobs(
repeat_count=2,
expected_response=["AI", "knock", "joke"],
max_tokens=30,
temperature=0.0,
top_logprobs=3,
),
completion_payload_with_logprobs(
repeat_count=2,
expected_response=["AI", "knock", "joke"],
max_tokens=30,
temperature=0.0,
logprobs=5,
),
],
),
# Aggregated deployment with LMCache: default chat/completion payloads plus
# metrics checks for both the "vllm" and "lmcache" backends.
"aggregated_lmcache": VLLMConfig(
name="aggregated_lmcache_xpu",
directory=vllm_dir,
script_name="xpu/agg_lmcache_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(360), # 3x estimated time (70s) + download time (150s)
],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(),
completion_payload_default(),
metric_payload_default(min_num_requests=6, backend="vllm"),
metric_payload_default(min_num_requests=6, backend="lmcache"),
],
),
# LMCache deployment with an explicit PROMETHEUS_MULTIPROC_DIR. The directory
# name is built once, when this module is imported, from the current PID and
# a random integer.
"aggregated_lmcache_multiproc": VLLMConfig(
name="aggregated_lmcache_multiproc_xpu",
directory=vllm_dir,
script_name="xpu/agg_lmcache_multiproc_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(360), # 3x estimated time (70s) + download time (150s)
],
model="Qwen/Qwen3-0.6B",
env={
"PROMETHEUS_MULTIPROC_DIR": f"/tmp/prometheus_multiproc_test_{os.getpid()}_{random.randint(0, 10000)}",
},
request_payloads=[
chat_payload_default(),
completion_payload_default(),
metric_payload_default(min_num_requests=6, backend="vllm"),
metric_payload_default(min_num_requests=6, backend="lmcache"),
],
),
# Request-plane script launched with --tcp; default chat and completion payloads.
"agg-request-plane-tcp": VLLMConfig(
name="agg-request-plane-tcp-xpu",
directory=vllm_dir,
script_name="xpu/agg_request_planes_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(300), # 3x measured time (43s) + download time (150s)
],
model="Qwen/Qwen3-0.6B",
script_args=["--tcp"],
request_payloads=[
chat_payload_default(),
completion_payload_default(),
],
),
# Request-plane script launched with --http; default chat and completion payloads.
"agg-request-plane-http": VLLMConfig(
name="agg-request-plane-http-xpu",
directory=vllm_dir,
script_name="xpu/agg_request_planes_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(300), # 3x measured time (43s) + download time (150s)
],
model="Qwen/Qwen3-0.6B",
script_args=["--http"],
request_payloads=[
chat_payload_default(),
completion_payload_default(),
],
),
# Two-XPU KV-routing deployment. The expected_log regexes describe log lines
# for ZMQ event batches, "Stored" events and the router's worker selection;
# DYN_LOG raises the log level for the two modules named in it.
"agg-router": VLLMConfig(
name="agg-router-xpu",
directory=vllm_dir,
script_name="xpu/agg_router_xpu.sh",
marks=[
pytest.mark.xpu_2,
pytest.mark.post_merge,
],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(
expected_log=[
r"ZMQ listener .* received batch with \d+ events \(engine_seq=\d+(?:, [^)]*)?\)",
r"Event processor for worker_id \d+ processing event: Stored\(",
r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
]
)
],
env={
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
},
),
# Two-XPU approximate KV-routing deployment. Currently skipped unconditionally
# (reason "DYN-2264").
"agg-router-approx": VLLMConfig(
name="agg-router-approx-xpu",
directory=vllm_dir,
script_name="xpu/agg_router_approx_xpu.sh",
marks=[
pytest.mark.xpu_2,
pytest.mark.post_merge,
pytest.mark.skip(reason="DYN-2264"),
],
model="Qwen/Qwen3-0.6B",
request_payloads=[
# Test approximate KV routing (--no-kv-events mode)
# Repeated requests should show cache-aware routing in logs
chat_payload_default(
repeat_count=3,
expected_log=[
# Verify scheduler is selecting workers with cache awareness
r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
# After first request, should see cached blocks being tracked
r"with \d+ cached blocks",
],
),
# Also test with cached tokens payload to verify usage field
cached_tokens_chat_payload(
repeat_count=3,
expected_log=[
# Verify routing decision shows cache hits
r"with \d+ cached blocks",
],
),
],
env={
"DYN_LOG": "dynamo_kv_router::scheduling::selector=debug",
},
),
"multimodal_agg_frontend_decoding": VLLMConfig(
name="multimodal_agg_frontend_decoding_xpu",
directory=vllm_dir,
script_name="xpu/agg_multimodal_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.pre_merge,
pytest.mark.skip("skip for XPU"),
],
model="Qwen/Qwen2-VL-2B-Instruct",
# Pass --frontend-decoding to enable Rust frontend image decoding + NIXL RDMA transfer
script_args=[
"--model",
"Qwen/Qwen2-VL-2B-Instruct",
"--frontend-decoding",
],
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["green"],
temperature=0.0,
max_tokens=100,
)
],
),
# Multimodal deployment with Qwen2.5-VL-7B answering a colour question about
# the shared test image. Skipped unconditionally on XPU.
"multimodal_agg_qwen": VLLMConfig(
name="multimodal_agg_qwen_xpu",
directory=vllm_dir,
script_name="xpu/agg_multimodal_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.pre_merge,
pytest.mark.skip(reason="skip for XPU"),
],
model="Qwen/Qwen2.5-VL-7B-Instruct",
script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
delayed_start=0,
timeout=360,
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["Green, White"],
max_tokens=100,
),
],
),
# Multimodal deployment with LLaVA 1.5: one image-URL request and one plain
# string-content request. Marked xfail(strict=False), so either outcome is
# accepted (see the linked issue).
"multimodal_agg_llava": VLLMConfig(
name="multimodal_agg_llava_xpu",
directory=vllm_dir,
script_name="xpu/agg_multimodal_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.nightly,
# https://github.com/ai-dynamo/dynamo/issues/4501
pytest.mark.xfail(strict=False),
],
model="llava-hf/llava-1.5-7b-hf",
script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
delayed_start=0,
timeout=360,
request_payloads=[
# HTTP URL test
chat_payload(
[
{"type": "text", "text": "What is in this image?"},
{
"type": "image_url",
"image_url": {
"url": "http://images.cocodataset.org/test2017/000000155781.jpg"
},
},
],
repeat_count=1,
expected_response=["bus"],
temperature=0.0,
),
# String content test - verifies string → array conversion for multimodal templates
chat_payload_default(
repeat_count=1,
expected_response=[], # Just validate no error
),
],
),
# Tool calling on a multimodal model: the multimodal launch script is started
# with the hermes tool-call parser and a 10000-token context, and the request
# offers a single "describe_image" tool. Skipped unconditionally on XPU.
"aggregated_toolcalling": VLLMConfig(
name="aggregated_toolcalling_xpu",
directory=vllm_dir,
script_name="xpu/agg_multimodal_xpu.sh",
marks=[
pytest.mark.xpu_2,
pytest.mark.multimodal,
pytest.mark.nightly,
pytest.mark.skip(reason="skip for XPU"),
],
model="Qwen/Qwen3-VL-8B-Instruct",
script_args=[
"--model",
"Qwen/Qwen3-VL-8B-Instruct",
"--max-model-len",
"10000",
"--dyn-tool-call-parser",
"hermes",
],
delayed_start=0,
timeout=600,
request_payloads=[
ToolCallingChatPayload(
body={
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe what you see in this image in detail.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
}
],
"tools": [
{
"type": "function",
"function": {
"name": "describe_image",
"description": "Provides detailed description of objects and scenes in an image",
"parameters": {
"type": "object",
"properties": {
"objects": {
"type": "array",
"items": {"type": "string"},
"description": "List of objects detected in the image",
},
"scene": {
"type": "string",
"description": "Overall scene description",
},
},
"required": ["objects", "scene"],
},
},
}
],
"tool_choice": "auto",
"max_tokens": 1024,
},
repeat_count=1,
expected_response=["green"], # Validate image understanding
expected_log=[],
expected_tool_name="describe_image", # Validate tool call happened
)
],
),
# Base (non-chat) model served with only the completions endpoint type
# enabled, so only a completion payload is sent. Skipped unconditionally on XPU.
"completions_only": VLLMConfig(
name="completions_only_xpu",
directory=vllm_dir,
script_name="xpu/agg_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.post_merge,
pytest.mark.skip(reason="skip for XPU"),
pytest.mark.timeout(
420
), # 3x estimated time (60s) + download time (240s) for 7B model
],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=[
"--model",
"deepseek-ai/deepseek-llm-7b-base",
"--dyn-endpoint-types",
"completions",
],
request_payloads=[
completion_payload_default(),
],
),
# Guided decoding on the baseline aggregated deployment: one request each
# constrained by guided_json, guided_regex and guided_choice.
"guided_decoding": VLLMConfig(
name="guided_decoding_xpu",
directory=vllm_dir,
script_name="xpu/agg_xpu.sh",
marks=[pytest.mark.xpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B",
request_payloads=[
# JSON-schema constrained output: the response must contain both keys.
chat_payload(
"Generate a person with name and age",
repeat_count=1,
expected_response=['"name"', '"age"'],
temperature=0.0,
max_tokens=100,
extra_body={
"guided_json": {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer"},
},
"required": ["name", "age"],
}
},
),
# Regex constrained output.
chat_payload(
"Generate a color name (red, blue, or green)",
repeat_count=1,
expected_response=["red", "blue", "green"],
temperature=0.0,
max_tokens=20,
extra_body={"guided_regex": r"(red|blue|green)"},
),
# Choice constrained output.
chat_payload(
"Generate a color name (red, blue, or green)",
repeat_count=1,
expected_response=["red", "blue", "green"],
temperature=0.0,
max_tokens=20,
extra_body={"guided_choice": ["red", "blue", "green"]},
),
],
),
}
@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Return the entry of ``vllm_configs`` selected by the current fixture param.

    The fixture is parametrized with ``params_with_model_mark(vllm_configs)``;
    each param value is used as a key into ``vllm_configs``.
    """
    config_key = request.param
    return vllm_configs[config_key]
@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
    vllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    image_server,
):
    """
    Run one parametrized vLLM serve deployment end to end.

    The configuration supplied by ``vllm_config_test`` is copied with its
    ``frontend_port`` replaced by the port from ``dynamo_dynamic_ports`` and
    then handed to ``run_serve_deployment``.

    ``runtime_services_dynamic_ports``, ``predownload_models`` and
    ``image_server`` are not referenced in the body; presumably they are
    requested only for their setup effects.
    """
    frontend_port = dynamo_dynamic_ports.frontend_port
    # Copy rather than mutate: the source config comes from the shared dict.
    port_bound_config = dataclasses.replace(
        vllm_config_test, frontend_port=frontend_port
    )
    run_serve_deployment(port_bound_config, request, ports=dynamo_dynamic_ports)
@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.xpu_1
@pytest.mark.nightly
@pytest.mark.skip(reason="skip for XPU")
@pytest.mark.timeout(360)  # Same value as the VLLMConfig timeout built below
def test_multimodal_b64(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
):
    """
    Test multimodal inference with an image passed as a base64 data URL.

    Kept as a standalone test because the image bytes are fetched while the
    test runs (not at collection time), so a missing image only fails this
    test when it is actually executed.
    """
    model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

    # Encode the image at execution time and wrap it in a PNG data URL.
    encoded_image = base64.b64encode(get_multimodal_test_image_bytes()).decode()
    text_part = {
        "type": "text",
        "text": "What colors are in the following image? Respond only with the colors.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }
    image_question = chat_payload(
        [text_part, image_part],
        repeat_count=1,
        expected_response=["Green, White"],
        max_tokens=100,
    )

    # Marks are applied on the test function, so the config carries none.
    base_config = VLLMConfig(
        name="test_multimodal_b64_xpu",
        directory=vllm_dir,
        script_name="xpu/agg_multimodal_xpu.sh",
        marks=[],
        model=model_id,
        script_args=["--model", model_id],
        delayed_start=0,
        timeout=360,
        request_payloads=[image_question],
    )
    port_bound_config = dataclasses.replace(
        base_config, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    run_serve_deployment(port_bound_config, request, ports=dynamo_dynamic_ports)
# LoRA test directory: the "launch/lora" path underneath vllm_dir.
# NOTE(review): no use of lora_dir is visible in this part of the file -
# confirm it is still referenced elsewhere.
lora_dir = os.path.join(vllm_dir, "launch/lora")
def lora_chat_payload(
    lora_name: str,
    s3_uri: str,
    system_port: int = DefaultPort.SYSTEM1.value,
    repeat_count: int = 2,
    expected_response: Optional[list] = None,
    expected_log: Optional[list] = None,
    max_tokens: int = 100,
    temperature: float = 0.0,
) -> LoraTestChatPayload:
    """Build a ``LoraTestChatPayload`` for a fixed single-question chat request.

    Args:
        lora_name: Used both as the request body's ``"model"`` value and as the
            payload's ``lora_name``.
        s3_uri: Forwarded unchanged to the payload.
        system_port: Forwarded unchanged to the payload.
        repeat_count: Forwarded unchanged to the payload.
        expected_response: Expected response strings. Any falsy value (``None``
            or an empty list) is replaced by a built-in default list.
        expected_log: Expected log strings. Any falsy value becomes ``[]``.
        max_tokens: Placed in the request body as ``"max_tokens"``.
        temperature: Placed in the request body as ``"temperature"``.

    Returns:
        The constructed ``LoraTestChatPayload``; the request is non-streaming.
    """
    fallback_responses = ["learning", "neural", "network", "AI", "model"]
    user_turn = {
        "role": "user",
        "content": "What is deep learning? Answer in one sentence.",
    }
    request_body = {
        "model": lora_name,
        "messages": [user_turn],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False,
    }
    # Truthiness check on purpose: an empty list also selects the fallback.
    responses = expected_response if expected_response else fallback_responses
    logs = expected_log if expected_log else []
    return LoraTestChatPayload(
        body=request_body,
        lora_name=lora_name,
        s3_uri=s3_uri,
        system_port=system_port,
        repeat_count=repeat_count,
        expected_response=responses,
        expected_log=logs,
    )
@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.xpu_1
@pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.timeout(600)
@pytest.mark.skip(reason="skip for XPU")
@pytest.mark.post_merge
def test_lora_aggregated(
    request,
    runtime_services_dynamic_ports,
    predownload_models,
    minio_lora_service,
    dynamo_dynamic_ports,
):
    """
    Test LoRA inference against the aggregated XPU vLLM launch script.

    Steps performed here:
    1. Take the LoRA name, S3 URI and environment variables from the
       ``minio_lora_service`` fixture.
    2. Build a single LoRA chat payload aimed at ``DefaultPort.SYSTEM1``.
    3. Launch ``lora/xpu/agg_lora_xpu.sh`` through ``run_serve_deployment``
       with those environment variables.

    NOTE(review): loading the adapter is presumably handled by
    ``LoraTestChatPayload`` - confirm against its implementation.
    """
    minio: MinioLoraConfig = minio_lora_service

    # Payload carrying the LoRA name, its S3 location and the system port.
    adapter_payload = lora_chat_payload(
        lora_name=minio.lora_name,
        s3_uri=minio.get_s3_uri(),
        system_port=DefaultPort.SYSTEM1.value,
        repeat_count=2,
    )

    # The same env dict is stored on the config and passed as extra_env below.
    minio_env = minio.get_env_vars()
    base_config = VLLMConfig(
        name="test_lora_aggregated_xpu",
        directory=vllm_dir,
        script_name="lora/xpu/agg_lora_xpu.sh",
        marks=[],  # markers live on the test function
        model="Qwen/Qwen3-0.6B",
        timeout=600,
        env=minio_env,
        request_payloads=[adapter_payload],
    )
    port_bound_config = dataclasses.replace(
        base_config, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    run_serve_deployment(
        port_bound_config,
        request,
        ports=dynamo_dynamic_ports,
        extra_env=minio_env,
    )
@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.xpu_2
@pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.timeout(600)
@pytest.mark.skip(reason="skip for XPU")
@pytest.mark.post_merge
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_lora_aggregated_router(
    request,
    minio_lora_service,
    dynamo_dynamic_ports,
    num_system_ports,
):
    """
    Test LoRA inference against the XPU aggregated KV-router launch script.

    Steps performed here:
    1. Take the LoRA name, S3 URI and environment variables from the
       ``minio_lora_service`` fixture.
    2. Build one LoRA payload per system port (``SYSTEM1`` and ``SYSTEM2``),
       so each worker receives its own payload.
    3. Add a plain chat payload that targets the LoRA model name.
    4. Launch ``lora/xpu/agg_lora_router_xpu.sh`` through
       ``run_serve_deployment`` with those environment variables.
    """
    assert num_system_ports >= 2, "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    minio: MinioLoraConfig = minio_lora_service

    # One payload per worker, in SYSTEM1 -> SYSTEM2 order.
    worker_ports = (DefaultPort.SYSTEM1.value, DefaultPort.SYSTEM2.value)
    per_worker_payloads = [
        lora_chat_payload(
            lora_name=minio.lora_name,
            s3_uri=minio.get_s3_uri(),
            system_port=port,
            repeat_count=1,
        )
        for port in worker_ports
    ]

    # Follow-up request sent after the per-worker payloads.
    routed_payload = chat_payload(
        content="Explain machine learning in simple terms.",
        repeat_count=2,
        expected_response=["learn", "data", "algorithm", "model", "pattern"],
        max_tokens=150,
        temperature=0.0,
    ).with_model(minio.lora_name)

    # PYTHONHASHSEED is pinned; per the original author's note this keeps
    # KV event IDs deterministic.
    minio_env = minio.get_env_vars()
    minio_env["PYTHONHASHSEED"] = "0"

    base_config = VLLMConfig(
        name="test_lora_aggregated_router_xpu",
        directory=vllm_dir,
        script_name="lora/xpu/agg_lora_router_xpu.sh",
        marks=[],  # markers live on the test function
        model="Qwen/Qwen3-0.6B",
        timeout=600,
        env=minio_env,
        request_payloads=[*per_worker_payloads, routed_payload],
    )
    port_bound_config = dataclasses.replace(
        base_config, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    run_serve_deployment(
        port_bound_config,
        request,
        ports=dynamo_dynamic_ports,
        extra_env=minio_env,
    )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment