feat: add examples and unit tests for vLLM aggregated serving on XPU (#7146)

Signed-off-by: Yi Yao <yi.a.yao@intel.com> Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>

feat: add examples and unit tests for vLLM aggregated serving on XPU (#7146)
Signed-off-by: Yi Yao <yi.a.yao@intel.com> Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
dd7ceb4a · Yi Yao · GitHub · c09ac697 · dd7ceb4a · dd7ceb4a
Unverified Commit dd7ceb4a authored Apr 02, 2026 by Yi Yao Committed by GitHub Apr 02, 2026
11 changed files
--- a/docs/backends/vllm/vllm-examples.md
+++ b/docs/backends/vllm/vllm-examples.md
@@ -40,27 +40,36 @@ Each launch script runs the frontend and worker(s) in a single terminal. You can

 The simplest deployment pattern: a single worker handles both prefill and decode. Requires 1 GPU.

+Run on CUDA devices:
+
 ```bash
 cd $DYNAMO_HOME/examples/backends/vllm
 bash launch/agg.sh
 ```

-For XPU deployments, use a larger block size and set it to at least `64` (`>= 64`):
+Run on XPUs:

 ```bash
-# XeTLA ChunkPrefill FP8KV: only support block_size >= 64
 cd $DYNAMO_HOME/examples/backends/vllm
-bash launch/agg.sh --block-size 64
+bash launch/xpu/agg_xpu.sh
 ```

 ### Aggregated Serving with KV Routing

 Two workers behind a [KV-aware router](../../components/router/README.md) that maximizes cache reuse. Requires 2 GPUs.

+Run on CUDA devices:
+
 ```bash
 cd $DYNAMO_HOME/examples/backends/vllm
 bash launch/agg_router.sh
 ```
+
+Run on XPUs:
+
+```bash
+cd $DYNAMO_HOME/examples/backends/vllm
+bash launch/xpu/agg_router_xpu.sh
+```

 This launches the frontend in KV routing mode with two workers publishing KV events over ZMQ.


--- a/examples/backends/vllm/launch/lora/xpu/agg_lora_router_xpu.sh
+++ b/examples/backends/vllm/launch/lora/xpu/agg_lora_router_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../../common/launch_utils.sh"
+
+export AWS_ENDPOINT=http://localhost:9000
+export AWS_ACCESS_KEY_ID=minioadmin
+export AWS_SECRET_ACCESS_KEY=minioadmin
+export AWS_REGION=us-east-1
+export AWS_ALLOW_HTTP=true
+
+# Dynamo LoRA Configuration
+export DYN_LORA_ENABLED=true
+export DYN_LORA_PATH=/tmp/dynamo_loras_minio
+
+mkdir -p $DYN_LORA_PATH
+
+# Set deterministic hash for KV event IDs
+export PYTHONHASHSEED=0
+
+export VLLM_TARGET_DEVICE=xpu
+
+# Common configuration
+MODEL="Qwen/Qwen3-0.6B"
+BLOCK_SIZE=64
+
+SYSTEM_PORT1="${DYN_SYSTEM_PORT1:-8081}"
+SYSTEM_PORT2="${DYN_SYSTEM_PORT2:-8082}"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner --no-curl "Launching Aggregated + LoRA + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
+echo ""
+echo "Once running, test with:"
+echo ""
+echo "  # Check available models"
+echo "  curl http://localhost:${HTTP_PORT}/v1/models | jq ."
+echo ""
+echo "  # Load LoRA to both instances (using S3 URI)"
+echo "  curl -s -X POST http://localhost:${SYSTEM_PORT1}/v1/loras \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
+echo "         \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
+echo ""
+echo "  curl -s -X POST http://localhost:${SYSTEM_PORT2}/v1/loras \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
+echo "         \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
+echo ""
+echo "  # Test LoRA inference"
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
+echo "         \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+echo "         \"max_tokens\": 32}' | jq ."
+echo ""
+echo "=========================================="
+
+# run frontend + KV router
+# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
+python -m dynamo.frontend \
+    --router-mode kv \
+    --router-reset-states &
+
+# run workers
+# --enforce-eager is added for quick deployment. for production use, need to remove this flag
+# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT1} \
+ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
+    --model $MODEL \
+    --block-size $BLOCK_SIZE \
+    --enforce-eager \
+    --enable-lora \
+    --max-lora-rank 64 \
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
+
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT2} \
+VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
+ZE_AFFINITY_MASK=1 python3 -m dynamo.vllm \
+    --model $MODEL \
+    --block-size $BLOCK_SIZE \
+    --enforce-eager \
+    --enable-lora \
+    --max-lora-rank 64 \
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
+
+# Sample output after running LoRA inference curl request twice.
+# usage.prompt_tokens_details.cached_tokens is the number of tokens that were cached from the previous request.
+: <<'SAMPLE_OUTPUT'
+{
+  "id": "chatcmpl-0cf880c2-fe98-45c4-9c76-84c3ad1a56cc",
+  "choices": [
+    {
+      "index": 0,
+      "message": {
+        "content": "<think>\nOkay, so I need to develop a character background for a character named Elara. Let me start by understanding the requirements. The user wants",
+        "role": "assistant",
+        "reasoning_content": null
+      },
+      "finish_reason": "length"
+    }
+  ],
+  "created": 1765230243,
+  "model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
+  "object": "chat.completion",
+  "usage": {
+    "prompt_tokens": 196,
+    "completion_tokens": 30,
+    "total_tokens": 226,
+    "prompt_tokens_details": {
+      "audio_tokens": null,
+      "cached_tokens": 192              # tokens that were cached from the previous request.
+    }
+  },
+  "nvext": {
+    "worker_id": {
+      "prefill_worker_id": 7587891281668871552,
+      "decode_worker_id": 7587891281668871552
+    }
+  }
+}
+SAMPLE_OUTPUT
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/lora/xpu/agg_lora_xpu.sh
+++ b/examples/backends/vllm/launch/lora/xpu/agg_lora_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../../../common/launch_utils.sh"
+
+export AWS_ENDPOINT=http://localhost:9000
+export AWS_ACCESS_KEY_ID=minioadmin
+export AWS_SECRET_ACCESS_KEY=minioadmin
+export AWS_REGION=us-east-1
+export AWS_ALLOW_HTTP=true
+
+# Dynamo LoRA Configuration
+export DYN_LORA_ENABLED=true
+export DYN_LORA_PATH=/tmp/dynamo_loras_minio
+
+export VLLM_TARGET_DEVICE=xpu
+
+mkdir -p $DYN_LORA_PATH
+
+MODEL="Qwen/Qwen3-0.6B"
+
+SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner --no-curl "Launching Aggregated Serving + LoRA (1 GPU)" "$MODEL" "$HTTP_PORT"
+echo ""
+echo "Once running, test with:"
+echo ""
+echo "  # Check available models"
+echo "  curl http://localhost:${HTTP_PORT}/v1/models | jq ."
+echo ""
+echo "  # Load LoRA (using S3 URI)"
+echo "  curl -s -X POST http://localhost:${SYSTEM_PORT}/v1/loras \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"lora_name\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
+echo "         \"source\": {\"uri\": \"s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora\"}}' | jq ."
+echo ""
+echo "  # Test LoRA inference"
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"model\": \"codelion/Qwen3-0.6B-accuracy-recovery-lora\","
+echo "         \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
+echo "         \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
+echo ""
+echo "  # Test base model inference (for comparison)"
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"model\": \"${MODEL}\","
+echo "         \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}],"
+echo "         \"max_tokens\": 300, \"temperature\": 0.0}' | jq ."
+echo ""
+echo "  # Unload LoRA"
+echo "  curl -X DELETE http://localhost:${SYSTEM_PORT}/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
+echo ""
+echo "=========================================="
+
+# run ingress
+# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var.
+python -m dynamo.frontend &
+
+# ---- Tunable (override via env vars) ----
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
+MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+
+GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
+    python -m dynamo.vllm --model "$MODEL" --enforce-eager \
+    --max-model-len "$MAX_MODEL_LEN" \
+    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
+    --block-size "${BLOCK_SIZE:-64}" \
+    ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
+    --enable-lora \
+    --max-lora-rank 64 &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/xpu/agg_lmcache_multiproc_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_lmcache_multiproc_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+
+# Explicitly set PROMETHEUS_MULTIPROC_DIR (K8s-style deployment)
+# Use unique directory per test run to avoid conflicts
+export PROMETHEUS_MULTIPROC_DIR=${PROMETHEUS_MULTIPROC_DIR:-/tmp/prometheus_multiproc_$$_$RANDOM}
+rm -rf "$PROMETHEUS_MULTIPROC_DIR"
+mkdir -p "$PROMETHEUS_MULTIPROC_DIR"
+
+# Cleanup handler, registered below for the script's EXIT trap.
+# It removes the Prometheus multiprocess directory that was created earlier in
+# this script and then sends a signal to the whole process group (kill 0) so
+# the backgrounded frontend and worker are stopped as well.
+# NOTE(review): kill 0 also signals this shell itself, so anything placed after
+# it may never run -- presumably why the rm comes first; keep that order.
+cleanup() {
+    echo "Cleaning up..."
+    rm -rf "$PROMETHEUS_MULTIPROC_DIR"
+    kill 0
+}
+trap cleanup EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
+
+export VLLM_TARGET_DEVICE=xpu
+
+MODEL="Qwen/Qwen3-0.6B"
+
+# ---- Tunable (override via env vars) ----
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
+MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+
+GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"
+
+python -m dynamo.frontend &
+
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+  PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \
+  python -m dynamo.vllm --model "$MODEL" --enforce-eager \
+  --max-model-len "$MAX_MODEL_LEN" \
+  --max-num-seqs "$MAX_CONCURRENT_SEQS" \
+  --block-size "${BLOCK_SIZE:-64}" \
+  ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
+  --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/xpu/agg_lmcache_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_lmcache_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# Explicitly unset PROMETHEUS_MULTIPROC_DIR to let LMCache or Dynamo manage it internally
+unset PROMETHEUS_MULTIPROC_DIR
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
+
+export VLLM_TARGET_DEVICE=xpu
+
+MODEL="Qwen/Qwen3-0.6B"
+
+# ---- Tunable (override via env vars) ----
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
+MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+
+GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"
+
+python -m dynamo.frontend &
+
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+  python -m dynamo.vllm --model "$MODEL" --enforce-eager \
+  --max-model-len "$MAX_MODEL_LEN" \
+  --max-num-seqs "$MAX_CONCURRENT_SEQS" \
+  --block-size "${BLOCK_SIZE:-64}" \
+  ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} \
+  --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both","kv_buffer_device":"xpu"}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/xpu/agg_multimodal_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_multimodal_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated multimodal serving with standard Dynamo preprocessing
+#
+# Architecture: Single-worker PD (Prefill-Decode)
+# - Frontend: Rust OpenAIPreprocessor handles image URLs (HTTP and data:// base64)
+# - Worker: Standard vLLM worker with vision model support
+#
+# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
+# see agg_multimodal_epd.sh
+
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
+
+export VLLM_TARGET_DEVICE=xpu
+
+# Default values
+MODEL_NAME="Qwen/Qwen3-VL-8B-Instruct"
+
+# Parse command line arguments
+# Extra arguments are passed through to the vLLM worker
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL_NAME=$2
+            shift 2
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS] [-- EXTRA_VLLM_ARGS]"
+            echo "Options:"
+            echo "  --model <model_name>   Specify the VLM model to use (default: $MODEL_NAME)"
+            echo "  -h, --help             Show this help message"
+            echo ""
+            echo "Any additional arguments are passed through to the vLLM worker."
+            echo "Example: $0 --model Qwen/Qwen3-VL-8B-Instruct --dyn-tool-call-parser hermes"
+            exit 0
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL_NAME" "$HTTP_PORT"
+
+# Use TCP transport (instead of default NATS)
+# TCP is preferred for multimodal workloads because it overcomes:
+# - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
+export DYN_REQUEST_PLANE=tcp
+
+# Start frontend with Rust OpenAIPreprocessor
+# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
+python -m dynamo.frontend &
+
+# ---- Per-model defaults ----
+# Model-specific defaults are resolved first so they win over the generic
+# fallbacks below. An explicit MAX_MODEL_LEN from the environment still takes
+# precedence over both, because every assignment uses the ${VAR:-default} form.
+MODEL_EXTRA_ARGS=""
+case "$MODEL_NAME" in
+    meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)
+        MAX_MODEL_LEN="${MAX_MODEL_LEN:-108960}"
+        MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
+esac
+# Generic fallbacks for any model without a dedicated entry above.
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
+MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+
+# When GPU_MEM_FRACTION comes back empty, the --gpu-memory-utilization flag is
+# omitted from the worker command line below.
+GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL_NAME" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+
+# Start vLLM worker with vision model (in the background, like the frontend,
+# so that wait_any_exit below can watch both processes).
+# MODEL_EXTRA_ARGS is intentionally unquoted so it expands to separate words.
+# Extra args from command line come last to allow overrides
+# NOTE(review): --enforce-eager is not passed here although sibling scripts use
+# it for quick deployment -- confirm whether that is intentional.
+# NOTE(review): the Maverick entry requests --tensor-parallel-size=8 while
+# ZE_AFFINITY_MASK defaults to a single device (0); callers presumably need to
+# override ZE_AFFINITY_MASK for that model -- confirm.
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} python -m dynamo.vllm --enable-multimodal --model "$MODEL_NAME" \
+    --max-model-len "$MAX_MODEL_LEN" \
+    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
+    --block-size "${BLOCK_SIZE:-64}" \
+    ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/xpu/agg_request_planes_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_request_planes_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
+
+export VLLM_TARGET_DEVICE=xpu
+
+# Parse command-line arguments for request plane mode
+REQUEST_PLANE="tcp"  # Default to TCP
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --tcp)
+            REQUEST_PLANE="tcp"
+            shift
+            ;;
+        --http)
+            REQUEST_PLANE="http"
+            shift
+            ;;
+        --nats)
+            REQUEST_PLANE="nats"
+            shift
+            ;;
+        -h|--help)
+            echo "Usage: $0 [--tcp|--http|--nats]"
+            echo "  --tcp   Use TCP request plane (default)"
+            echo "  --http  Use HTTP/2 request plane"
+            echo "  --nats  Use NATS request plane"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+MODEL="Qwen/Qwen3-0.6B"
+
+# ---- Tunable (override via env vars) ----
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
+MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+
+# Set the request plane mode
+export DYN_REQUEST_PLANE=$REQUEST_PLANE
+echo "Using request plane mode: $REQUEST_PLANE"
+
+GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"
+
+python -m dynamo.frontend &
+
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+DYN_HEALTH_CHECK_ENABLED=true \
+    python -m dynamo.vllm --model "$MODEL" --enforce-eager \
+    --max-model-len "$MAX_MODEL_LEN" \
+    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
+    --block-size "${BLOCK_SIZE:-64}" \
+    ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/xpu/agg_router_approx_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_router_approx_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
+
+export VLLM_TARGET_DEVICE=xpu
+
+# Common configuration
+MODEL="Qwen/Qwen3-0.6B"
+BLOCK_SIZE=64
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner "Launching Aggregated + Approximate KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
+
+# run frontend with KV router (--router-mode kv) in approximate mode (--no-kv-events)
+python -m dynamo.frontend \
+    --router-mode kv \
+    --no-kv-events &
+
+# run workers
+# --enforce-eager is added for quick deployment. for production use, need to remove this flag
+#
+# If multiple workers are launched, they must not share the same system/metrics port.
+# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
+# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
+
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
+ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
+    --model $MODEL \
+    --block-size $BLOCK_SIZE \
+    --kv-events-config '{"enable_kv_cache_events": false}' &
+
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
+VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
+ZE_AFFINITY_MASK=1 python3 -m dynamo.vllm \
+    --model $MODEL \
+    --block-size $BLOCK_SIZE \
+    --kv-events-config '{"enable_kv_cache_events": false}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/xpu/agg_router_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_router_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: rename to agg_router_2gpu.sh (uses 2 GPUs) and update all references
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
+
+# Set deterministic hash for KV event IDs
+export PYTHONHASHSEED=0
+
+export VLLM_TARGET_DEVICE=xpu
+
+# Common configuration
+MODEL="Qwen/Qwen3-0.6B"
+BLOCK_SIZE=64
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
+
+# run frontend + KV router
+# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
+python -m dynamo.frontend \
+    --router-mode kv \
+    --router-reset-states &
+
+# run workers
+# --enforce-eager is added for quick deployment. for production use, need to remove this flag
+#
+# If multiple workers are launched, they must not share the same system/metrics port.
+# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
+# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
+#
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
+ZE_AFFINITY_MASK=0 python3 -m dynamo.vllm \
+    --model $MODEL \
+    --block-size $BLOCK_SIZE \
+    --enforce-eager \
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
+
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
+VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
+ZE_AFFINITY_MASK=1 python3 -m dynamo.vllm \
+    --model $MODEL \
+    --block-size $BLOCK_SIZE \
+    --enforce-eager \
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/examples/backends/vllm/launch/xpu/agg_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated serving on a single XPU: one frontend plus one vLLM worker.
+#
+# Usage: agg_xpu.sh [--model <name>] [EXTRA_VLLM_ARGS...]
+#   --model <name>   Model to serve (default: Qwen/Qwen3-0.6B)
+#   Any other argument is passed through to the vLLM worker unchanged.
+#
+# Environment overrides: MAX_MODEL_LEN, MAX_CONCURRENT_SEQS, BLOCK_SIZE,
+# DYN_HTTP_PORT, DYN_SYSTEM_PORT.
+
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+# NOTE(review): build_gpu_mem_args is expected to come from gpu_utils.sh -- confirm.
+source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"   # GPU memory helpers
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
+
+export VLLM_TARGET_DEVICE=xpu
+
+# Default model
+MODEL="Qwen/Qwen3-0.6B"
+
+# Parse command line arguments
+# Unrecognized arguments are collected and forwarded to the vLLM worker.
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            # Fail with a clear message instead of letting `shift 2` abort the
+            # script silently under `set -e` when the value is missing.
+            if [[ $# -lt 2 ]]; then
+                echo "Error: --model requires a value" >&2
+                exit 1
+            fi
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+# ---- Tunable (override via env vars) ----
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
+MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+
+# When GPU_MEM_FRACTION comes back empty, the --gpu-memory-utilization flag is
+# omitted from the worker command line below.
+GPU_MEM_FRACTION=$(build_gpu_mem_args vllm --model "$MODEL" --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$MAX_CONCURRENT_SEQS")
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
+
+# run ingress
+# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
+python -m dynamo.frontend &
+
+# run worker
+# --enforce-eager is added for quick deployment. for production use, need to remove this flag
+# EXTRA_ARGS come last so command-line values can override the flags above.
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm --model "$MODEL" --enforce-eager \
+    --max-model-len "$MAX_MODEL_LEN" \
+    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
+    --block-size "${BLOCK_SIZE:-64}" \
+    ${GPU_MEM_FRACTION:+--gpu-memory-utilization "$GPU_MEM_FRACTION"} "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/tests/serve/test_vllm_xpu.py
+++ b/tests/serve/test_vllm_xpu.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import base64
+import dataclasses
+import logging
+import os
+import random
+from dataclasses import dataclass, field
+from typing import Optional
+
+import pytest
+
+from tests.serve.common import (
+    WORKSPACE_DIR,
+    params_with_model_mark,
+    run_serve_deployment,
+)
+from tests.serve.conftest import MULTIMODAL_IMG_URL, get_multimodal_test_image_bytes
+from tests.serve.lora_utils import MinioLoraConfig
+from tests.utils.constants import DefaultPort
+from tests.utils.engine_process import EngineConfig
+from tests.utils.payload_builder import (
+    cached_tokens_chat_payload,
+    chat_payload,
+    chat_payload_default,
+    chat_payload_with_logprobs,
+    completion_payload_default,
+    completion_payload_with_logprobs,
+    metric_payload_default,
+)
+from tests.utils.payloads import LoraTestChatPayload, ToolCallingChatPayload
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class VLLMConfig(EngineConfig):
+    """Configuration for vLLM test scenarios.
+
+    Subclasses ``EngineConfig`` and gives ``stragglers`` a default value of
+    ``["VLLM:EngineCore"]``.
+    """
+
+    # A default_factory is used so each config instance gets its own list.
+    # NOTE(review): presumably these are names of leftover processes the test
+    # harness looks for after a run -- confirm against EngineConfig.
+    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
+
+
+vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
+    WORKSPACE_DIR, "examples/backends/vllm"
+)
+
+
+# vLLM test configurations
+vllm_configs = {
+    "aggregated": VLLMConfig(
+        name="aggregated_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(300),  # 3x measured time (43s) + download time (150s)
+        ],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+            chat_payload(
+                "Can you write me a song?",
+                repeat_count=1,
+                expected_response=["song"],
+                temperature=0.0,
+                max_tokens=32,
+                extra_body={
+                    "stop": ["song"],
+                    "include_stop_str_in_output": True,
+                },
+            ),
+            metric_payload_default(min_num_requests=6, backend="vllm"),
+        ],
+    ),
+    "aggregated_logprobs": VLLMConfig(
+        name="aggregated_logprobs_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_xpu.sh",
+        marks=[pytest.mark.xpu_1, pytest.mark.post_merge],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            chat_payload_with_logprobs(
+                repeat_count=2,
+                expected_response=["AI", "knock", "joke"],
+                max_tokens=30,
+                temperature=0.0,
+                top_logprobs=3,
+            ),
+            completion_payload_with_logprobs(
+                repeat_count=2,
+                expected_response=["AI", "knock", "joke"],
+                max_tokens=30,
+                temperature=0.0,
+                logprobs=5,
+            ),
+        ],
+    ),
+    "aggregated_lmcache": VLLMConfig(
+        name="aggregated_lmcache_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_lmcache_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(360),  # 3x estimated time (70s) + download time (150s)
+        ],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+            metric_payload_default(min_num_requests=6, backend="vllm"),
+            metric_payload_default(min_num_requests=6, backend="lmcache"),
+        ],
+    ),
+    "aggregated_lmcache_multiproc": VLLMConfig(
+        name="aggregated_lmcache_multiproc_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_lmcache_multiproc_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(360),  # 3x estimated time (70s) + download time (150s)
+        ],
+        model="Qwen/Qwen3-0.6B",
+        env={
+            "PROMETHEUS_MULTIPROC_DIR": f"/tmp/prometheus_multiproc_test_{os.getpid()}_{random.randint(0, 10000)}",
+        },
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+            metric_payload_default(min_num_requests=6, backend="vllm"),
+            metric_payload_default(min_num_requests=6, backend="lmcache"),
+        ],
+    ),
+    "agg-request-plane-tcp": VLLMConfig(
+        name="agg-request-plane-tcp-xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_request_planes_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(300),  # 3x measured time (43s) + download time (150s)
+        ],
+        model="Qwen/Qwen3-0.6B",
+        script_args=["--tcp"],
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+        ],
+    ),
+    "agg-request-plane-http": VLLMConfig(
+        name="agg-request-plane-http-xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_request_planes_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(300),  # 3x measured time (43s) + download time (150s)
+        ],
+        model="Qwen/Qwen3-0.6B",
+        script_args=["--http"],
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+        ],
+    ),
+    "agg-router": VLLMConfig(
+        name="agg-router-xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_router_xpu.sh",
+        marks=[
+            pytest.mark.xpu_2,
+            pytest.mark.post_merge,
+        ],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            chat_payload_default(
+                expected_log=[
+                    r"ZMQ listener .* received batch with \d+ events \(engine_seq=\d+(?:, [^)]*)?\)",
+                    r"Event processor for worker_id \d+ processing event: Stored\(",
+                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
+                ]
+            )
+        ],
+        env={
+            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
+        },
+    ),
+    "agg-router-approx": VLLMConfig(
+        name="agg-router-approx-xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_router_approx_xpu.sh",
+        marks=[
+            pytest.mark.xpu_2,
+            pytest.mark.post_merge,
+            pytest.mark.skip(reason="DYN-2264"),
+        ],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            # Test approximate KV routing (--no-kv-events mode)
+            # Repeated requests should show cache-aware routing in logs
+            chat_payload_default(
+                repeat_count=3,
+                expected_log=[
+                    # Verify scheduler is selecting workers with cache awareness
+                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
+                    # After first request, should see cached blocks being tracked
+                    r"with \d+ cached blocks",
+                ],
+            ),
+            # Also test with cached tokens payload to verify usage field
+            cached_tokens_chat_payload(
+                repeat_count=3,
+                expected_log=[
+                    # Verify routing decision shows cache hits
+                    r"with \d+ cached blocks",
+                ],
+            ),
+        ],
+        env={
+            "DYN_LOG": "dynamo_kv_router::scheduling::selector=debug",
+        },
+    ),
+    "multimodal_agg_frontend_decoding": VLLMConfig(
+        name="multimodal_agg_frontend_decoding_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_multimodal_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.skip("skip for XPU"),
+        ],
+        model="Qwen/Qwen2-VL-2B-Instruct",
+        # Pass --frontend-decoding to enable Rust frontend image decoding + NIXL RDMA transfer
+        script_args=[
+            "--model",
+            "Qwen/Qwen2-VL-2B-Instruct",
+            "--frontend-decoding",
+        ],
+        request_payloads=[
+            chat_payload(
+                [
+                    {
+                        "type": "text",
+                        "text": "What colors are in the following image? Respond only with the colors.",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": MULTIMODAL_IMG_URL},
+                    },
+                ],
+                repeat_count=1,
+                expected_response=["green"],
+                temperature=0.0,
+                max_tokens=100,
+            )
+        ],
+    ),
+    "multimodal_agg_qwen": VLLMConfig(
+        name="multimodal_agg_qwen_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_multimodal_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.skip(reason="skip for XPU"),
+        ],
+        model="Qwen/Qwen2.5-VL-7B-Instruct",
+        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
+        delayed_start=0,
+        timeout=360,
+        request_payloads=[
+            chat_payload(
+                [
+                    {
+                        "type": "text",
+                        "text": "What colors are in the following image? Respond only with the colors.",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": MULTIMODAL_IMG_URL},
+                    },
+                ],
+                repeat_count=1,
+                expected_response=["Green, White"],
+                max_tokens=100,
+            ),
+        ],
+    ),
+    "multimodal_agg_llava": VLLMConfig(
+        name="multimodal_agg_llava_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_multimodal_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.nightly,
+            # https://github.com/ai-dynamo/dynamo/issues/4501
+            pytest.mark.xfail(strict=False),
+        ],
+        model="llava-hf/llava-1.5-7b-hf",
+        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
+        delayed_start=0,
+        timeout=360,
+        request_payloads=[
+            # HTTP URL test
+            chat_payload(
+                [
+                    {"type": "text", "text": "What is in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
+                        },
+                    },
+                ],
+                repeat_count=1,
+                expected_response=["bus"],
+                temperature=0.0,
+            ),
+            # String content test - verifies string → array conversion for multimodal templates
+            chat_payload_default(
+                repeat_count=1,
+                expected_response=[],  # Just validate no error
+            ),
+        ],
+    ),
+    "aggregated_toolcalling": VLLMConfig(
+        name="aggregated_toolcalling_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_multimodal_xpu.sh",
+        marks=[
+            pytest.mark.xpu_2,
+            pytest.mark.multimodal,
+            pytest.mark.nightly,
+            pytest.mark.skip(reason="skip for XPU"),
+        ],
+        model="Qwen/Qwen3-VL-8B-Instruct",
+        script_args=[
+            "--model",
+            "Qwen/Qwen3-VL-8B-Instruct",
+            "--max-model-len",
+            "10000",
+            "--dyn-tool-call-parser",
+            "hermes",
+        ],
+        delayed_start=0,
+        timeout=600,
+        request_payloads=[
+            ToolCallingChatPayload(
+                body={
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "Describe what you see in this image in detail.",
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": MULTIMODAL_IMG_URL},
+                                },
+                            ],
+                        }
+                    ],
+                    "tools": [
+                        {
+                            "type": "function",
+                            "function": {
+                                "name": "describe_image",
+                                "description": "Provides detailed description of objects and scenes in an image",
+                                "parameters": {
+                                    "type": "object",
+                                    "properties": {
+                                        "objects": {
+                                            "type": "array",
+                                            "items": {"type": "string"},
+                                            "description": "List of objects detected in the image",
+                                        },
+                                        "scene": {
+                                            "type": "string",
+                                            "description": "Overall scene description",
+                                        },
+                                    },
+                                    "required": ["objects", "scene"],
+                                },
+                            },
+                        }
+                    ],
+                    "tool_choice": "auto",
+                    "max_tokens": 1024,
+                },
+                repeat_count=1,
+                expected_response=["green"],  # Validate image understanding
+                expected_log=[],
+                expected_tool_name="describe_image",  # Validate tool call happened
+            )
+        ],
+    ),
+    "completions_only": VLLMConfig(
+        name="completions_only_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.post_merge,
+            pytest.mark.skip(reason="skip for XPU"),
+            pytest.mark.timeout(
+                420
+            ),  # 3x estimated time (60s) + download time (240s) for 7B model
+        ],
+        model="deepseek-ai/deepseek-llm-7b-base",
+        script_args=[
+            "--model",
+            "deepseek-ai/deepseek-llm-7b-base",
+            "--dyn-endpoint-types",
+            "completions",
+        ],
+        request_payloads=[
+            completion_payload_default(),
+        ],
+    ),
+    "guided_decoding": VLLMConfig(
+        name="guided_decoding_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_xpu.sh",
+        marks=[pytest.mark.xpu_1, pytest.mark.pre_merge],
+        model="Qwen/Qwen3-0.6B",
+        request_payloads=[
+            chat_payload(
+                "Generate a person with name and age",
+                repeat_count=1,
+                expected_response=['"name"', '"age"'],
+                temperature=0.0,
+                max_tokens=100,
+                extra_body={
+                    "guided_json": {
+                        "type": "object",
+                        "properties": {
+                            "name": {"type": "string"},
+                            "age": {"type": "integer"},
+                        },
+                        "required": ["name", "age"],
+                    }
+                },
+            ),
+            chat_payload(
+                "Generate a color name (red, blue, or green)",
+                repeat_count=1,
+                expected_response=["red", "blue", "green"],
+                temperature=0.0,
+                max_tokens=20,
+                extra_body={"guided_regex": r"(red|blue|green)"},
+            ),
+            chat_payload(
+                "Generate a color name (red, blue, or green)",
+                repeat_count=1,
+                expected_response=["red", "blue", "green"],
+                temperature=0.0,
+                max_tokens=20,
+                extra_body={"guided_choice": ["red", "blue", "green"]},
+            ),
+        ],
+    ),
+}
+
+
+@pytest.fixture(params=params_with_model_mark(vllm_configs))
+def vllm_config_test(request):
+    """Return the entry of ``vllm_configs`` selected by the current fixture param.
+
+    ``request.param`` is used directly as the key into ``vllm_configs``.
+    """
+    config_key = request.param
+    return vllm_configs[config_key]
+
+
+@pytest.mark.vllm
+@pytest.mark.e2e
+def test_serve_deployment(
+    vllm_config_test,
+    request,
+    runtime_services_dynamic_ports,
+    dynamo_dynamic_ports,
+    predownload_models,
+    image_server,
+):
+    """
+    Run one parametrized vLLM serve deployment end to end.
+
+    The config supplied by ``vllm_config_test`` is copied with its
+    ``frontend_port`` replaced by the dynamically allocated frontend port and
+    then handed to ``run_serve_deployment``. The remaining fixtures are
+    requested but never referenced in the body.
+    """
+    frontend_port = dynamo_dynamic_ports.frontend_port
+    # Copy instead of mutating: the source config is an entry of the shared
+    # module-level ``vllm_configs`` mapping.
+    deployment_config = dataclasses.replace(
+        vllm_config_test, frontend_port=frontend_port
+    )
+    run_serve_deployment(deployment_config, request, ports=dynamo_dynamic_ports)
+
+
+@pytest.mark.vllm
+@pytest.mark.e2e
+@pytest.mark.xpu_1
+@pytest.mark.nightly
+@pytest.mark.skip(reason="skip for XPU")
+@pytest.mark.timeout(360)  # Same value as the VLLMConfig timeout set below
+def test_multimodal_b64(
+    request,
+    runtime_services_dynamic_ports,
+    dynamo_dynamic_ports,
+    predownload_models,
+):
+    """
+    Exercise multimodal inference with the image inlined as a base64 data URL.
+
+    Kept apart from the parametrized configs because the image is read while
+    the test body runs rather than at collection time, so a problem loading it
+    can only fail this test when it is actually executed.
+    """
+    model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
+
+    # Encode the image at execution time. The helper is expected to hand back
+    # real PNG bytes even if MULTIMODAL_IMG is only a Git LFS pointer.
+    b64_img = base64.b64encode(get_multimodal_test_image_bytes()).decode()
+
+    # Message content: a text question followed by the inlined image.
+    text_part = {
+        "type": "text",
+        "text": "What colors are in the following image? Respond only with the colors.",
+    }
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": f"data:image/png;base64,{b64_img}"},
+    }
+    b64_payload = chat_payload(
+        [text_part, image_part],
+        repeat_count=1,
+        expected_response=["Green, White"],
+        max_tokens=100,
+    )
+
+    # Deployment description; pytest markers live on the function itself.
+    base_config = VLLMConfig(
+        name="test_multimodal_b64_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_multimodal_xpu.sh",
+        marks=[],
+        model=model_name,
+        script_args=["--model", model_name],
+        delayed_start=0,
+        timeout=360,
+        request_payloads=[b64_payload],
+    )
+
+    # Point the copy at the dynamically allocated frontend port, then run it.
+    run_serve_deployment(
+        dataclasses.replace(
+            base_config, frontend_port=dynamo_dynamic_ports.frontend_port
+        ),
+        request,
+        ports=dynamo_dynamic_ports,
+    )
+
+
+# Path of the `launch/lora` subdirectory under `vllm_dir`.
+# NOTE(review): the LoRA tests visible in this part of the file pass
+# `directory=vllm_dir` with a `lora/xpu/...` script name and do not reference
+# `lora_dir` - confirm it is still used elsewhere.
+lora_dir = os.path.join(vllm_dir, "launch/lora")
+
+
+def lora_chat_payload(
+    lora_name: str,
+    s3_uri: str,
+    system_port: int = DefaultPort.SYSTEM1.value,
+    repeat_count: int = 2,
+    expected_response: Optional[list] = None,
+    expected_log: Optional[list] = None,
+    max_tokens: int = 100,
+    temperature: float = 0.0,
+) -> LoraTestChatPayload:
+    """Create a LoRA-enabled chat payload for testing.
+
+    Args:
+        lora_name: Name of the LoRA adapter; also sent as the request's
+            ``model`` field.
+        s3_uri: URI of the adapter, forwarded to the payload unchanged.
+        system_port: System port forwarded to the payload (the router test in
+            this file uses it to target one worker per payload).
+        repeat_count: Forwarded to the payload unchanged.
+        expected_response: Values forwarded as the payload's
+            ``expected_response``. ``None`` selects a default keyword list; an
+            explicitly passed list, including an empty one, is kept as given.
+        expected_log: Patterns forwarded as the payload's ``expected_log``.
+            ``None`` means an empty list.
+        max_tokens: Value of the request's ``max_tokens`` field.
+        temperature: Value of the request's ``temperature`` field.
+
+    Returns:
+        A ``LoraTestChatPayload`` carrying a single-turn, non-streaming chat
+        request.
+    """
+    # Compare against None rather than relying on truthiness: an empty list is
+    # a deliberate "no expectations" value and must not be replaced by the
+    # defaults.
+    if expected_response is None:
+        expected_response = ["learning", "neural", "network", "AI", "model"]
+    if expected_log is None:
+        expected_log = []
+
+    return LoraTestChatPayload(
+        body={
+            "model": lora_name,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is deep learning? Answer in one sentence.",
+                }
+            ],
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "stream": False,
+        },
+        lora_name=lora_name,
+        s3_uri=s3_uri,
+        system_port=system_port,
+        repeat_count=repeat_count,
+        expected_response=expected_response,
+        expected_log=expected_log,
+    )
+
+
+@pytest.mark.vllm
+@pytest.mark.e2e
+@pytest.mark.xpu_1
+@pytest.mark.model("Qwen/Qwen3-0.6B")
+@pytest.mark.timeout(600)
+@pytest.mark.skip(reason="skip for XPU")
+@pytest.mark.post_merge
+def test_lora_aggregated(
+    request,
+    runtime_services_dynamic_ports,
+    predownload_models,
+    minio_lora_service,
+    dynamo_dynamic_ports,
+):
+    """
+    Exercise LoRA inference against an aggregated vLLM deployment.
+
+    Steps:
+    1. The MinIO fixture supplies S3-compatible storage holding the LoRA
+    2. vLLM is launched with LoRA support enabled
+    3. The LoRA adapter is loaded through the system API
+    4. Inference requests are sent to the LoRA model
+    """
+    minio_config: MinioLoraConfig = minio_lora_service
+
+    # One payload covers both loading the adapter and querying it.
+    lora_payload = lora_chat_payload(
+        lora_name=minio_config.lora_name,
+        s3_uri=minio_config.get_s3_uri(),
+        system_port=DefaultPort.SYSTEM1.value,
+        repeat_count=2,
+    )
+
+    # The MinIO environment goes both into the config and into the launch call.
+    env_vars = minio_config.get_env_vars()
+    base_config = VLLMConfig(
+        name="test_lora_aggregated_xpu",
+        directory=vllm_dir,
+        script_name="lora/xpu/agg_lora_xpu.sh",
+        marks=[],  # pytest markers are applied on the function
+        model="Qwen/Qwen3-0.6B",
+        timeout=600,
+        env=env_vars,
+        request_payloads=[lora_payload],
+    )
+
+    # Point the copy at the dynamically allocated frontend port, then run it.
+    run_serve_deployment(
+        dataclasses.replace(
+            base_config, frontend_port=dynamo_dynamic_ports.frontend_port
+        ),
+        request,
+        ports=dynamo_dynamic_ports,
+        extra_env=env_vars,
+    )
+
+
+@pytest.mark.vllm
+@pytest.mark.e2e
+@pytest.mark.xpu_2
+@pytest.mark.model("Qwen/Qwen3-0.6B")
+@pytest.mark.timeout(600)
+@pytest.mark.skip(reason="skip for XPU")
+@pytest.mark.post_merge
+@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
+def test_lora_aggregated_router(
+    request,
+    minio_lora_service,
+    dynamo_dynamic_ports,
+    num_system_ports,
+):
+    """
+    Exercise LoRA inference on an aggregated vLLM deployment behind the KV router.
+
+    Steps:
+    1. The MinIO fixture supplies S3-compatible storage holding the LoRA
+    2. Several vLLM workers are launched with LoRA support and the KV router
+    3. The LoRA adapter is loaded on both workers through the system API
+    4. Inference requests are sent to the LoRA model to check KV cache routing
+    """
+    assert (
+        num_system_ports >= 2
+    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"
+    minio_config: MinioLoraConfig = minio_lora_service
+
+    # One adapter-loading payload per worker, in order: worker 1 on
+    # DefaultPort.SYSTEM1, then worker 2 on DefaultPort.SYSTEM2.
+    worker_payloads = [
+        lora_chat_payload(
+            lora_name=minio_config.lora_name,
+            s3_uri=minio_config.get_s3_uri(),
+            system_port=worker_port.value,
+            repeat_count=1,
+        )
+        for worker_port in (DefaultPort.SYSTEM1, DefaultPort.SYSTEM2)
+    ]
+
+    # Follow-up requests that exercise routing once the adapter is loaded.
+    routing_payload = chat_payload(
+        content="Explain machine learning in simple terms.",
+        repeat_count=2,
+        expected_response=["learn", "data", "algorithm", "model", "pattern"],
+        max_tokens=150,
+        temperature=0.0,
+    ).with_model(minio_config.lora_name)
+
+    # MinIO environment plus PYTHONHASHSEED for deterministic KV event IDs.
+    env_vars = minio_config.get_env_vars()
+    env_vars["PYTHONHASHSEED"] = "0"
+
+    base_config = VLLMConfig(
+        name="test_lora_aggregated_router_xpu",
+        directory=vllm_dir,
+        script_name="lora/xpu/agg_lora_router_xpu.sh",
+        marks=[],  # pytest markers are applied on the function
+        model="Qwen/Qwen3-0.6B",
+        timeout=600,
+        env=env_vars,
+        request_payloads=[*worker_payloads, routing_payload],
+    )
+
+    # Point the copy at the dynamically allocated frontend port, then run it.
+    run_serve_deployment(
+        dataclasses.replace(
+            base_config, frontend_port=dynamo_dynamic_ports.frontend_port
+        ),
+        request,
+        ports=dynamo_dynamic_ports,
+        extra_env=env_vars,
+    )