feat: Update XPU dockerfile with vllm-v0.17.1 (#7500)

Signed-off-by: Hongming Zheng <hongming.zheng@intel.com> Co-authored-by: Zhan Xue <zhan.xue@intel.com> Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>

feat: Update XPU dockerfile with vllm-v0.17.1 (#7500)
Signed-off-by: Hongming Zheng <hongming.zheng@intel.com> Co-authored-by: Zhan Xue <zhan.xue@intel.com> Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
06f17011 · ZhengHongming888 · GitHub · a58bcc31 · 06f17011 · 06f17011
Unverified Commit 06f17011 authored Mar 26, 2026 by ZhengHongming888 Committed by GitHub Mar 26, 2026
5 changed files
--- a/container/context.yaml
+++ b/container/context.yaml
@@ -52,7 +52,7 @@ vllm:
    runtime_image: intel/deep-learning-essentials
    base_image_tag: 2025.3.2-0-devel-ubuntu24.04
    runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04
-    vllm_ref: v0.14.0
+    vllm_ref: v0.17.1
  cpu:
    base_image: ubuntu
    runtime_image: ubuntu

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -143,7 +143,6 @@ echo "✓ vLLM repository cloned"
 if [ "$DEVICE" = "xpu" ]; then
    echo "\n=== Installing vLLM ==="
-    git apply --ignore-whitespace /tmp/vllm-xpu.patch
    uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match
    uv pip install --verbose --no-build-isolation .
 fi

--- a/container/templates/local_dev.Dockerfile
+++ b/container/templates/local_dev.Dockerfile
@@ -80,7 +80,7 @@ RUN mkdir -p /home/$USERNAME/.cache/ \
 {% if device == "xpu" %}
 SHELL ["bash", "-c"]
-CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
+CMD ["bash", "-c", "source /home/$USERNAME/.bashrc && exec bash"]
 {% else %}
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
 CMD []

--- a/container/templates/vllm_framework.Dockerfile
+++ b/container/templates/vllm_framework.Dockerfile
@@ -87,7 +87,6 @@ ENV CUDA_HOME=/usr/local/cuda
 {% endif %}
 {% if device == "xpu" %}
-RUN wget --tries=3 --waitretry=5 https://raw.githubusercontent.com/intel/llm-scaler/35a14cbc08d714f460a29b7a7328df5620c8530f/vllm/patches/ai-dynamo-xpu/patches/vllm-xpu-v0.14.0.patch -O /tmp/vllm-xpu.patch
 ENV VLLM_TARGET_DEVICE=xpu
 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
 {% endif %}

--- a/examples/backends/vllm/launch/xpu/disagg_multimodal_epd_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/disagg_multimodal_epd_xpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Abort the launcher as soon as any foreground command exits non-zero.
+set -e
+# On every exit path, signal the whole process group (`kill 0`) so the
+# background frontend and workers started below do not outlive this script.
+trap 'echo Cleaning up...; kill 0' EXIT
+# Directory containing this script, with symlinks resolved, so the shared
+# helpers can be sourced regardless of the caller's working directory.
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+# Shared helpers. print_launch_banner and wait_any_exit (used further down) are
+# not defined in this file and presumably come from these - TODO confirm.
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+# Default values
+MODEL_NAME="llava-hf/llava-1.5-7b-hf"
+# --single-gpu: Packs all 3 workers (encode, prefill, decode) onto a single GPU.
+# This is intended for functional testing with small models (e.g. 2B) where CI
+# only has 1 GPU available. It reduces performance by:
+#   - Enabling --enforce-eager (disables torch.compile and CUDA graph capture)
+#   - Hardcoding P/D KV cache to 512 MB (skips all memory profiling)
+#   - Limiting --max-model-len to 4096 tokens on P/D workers
+#   - Limiting P/D workers to image=1,video=0,audio=0 (--limit-mm-per-prompt)
+#   - Using lower gpu-memory-utilization fractions to share the GPU
+SINGLE_GPU=false
+# Parse command line arguments.
+# Recognised options: --model <name>, --single-gpu, -h/--help.
+# Anything else, or a --model without a value, is a usage error (exit 1).
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --model)
+            # Reject a missing or empty value up front. Without this check
+            # `shift 2` fails under `set -e` and the script exits without
+            # telling the user what went wrong.
+            if [[ $# -lt 2 || -z "$2" ]]; then
+                echo "Error: --model requires a <model_name> argument" >&2
+                echo "Use --help for usage information" >&2
+                exit 1
+            fi
+            MODEL_NAME="$2"
+            shift 2
+            ;;
+        --single-gpu)
+            SINGLE_GPU=true
+            shift
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo ""
+            echo "Disaggregated multimodal serving with separate Encode/Prefill/Decode workers"
+            echo ""
+            echo "Options:"
+            echo "  --model <model_name>          Specify the VLM model to use (default: $MODEL_NAME)"
+            echo "                                LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
+            echo "  --single-gpu                  Pack all 3 workers on 1 GPU (for small models, e.g. 2B)"
+            echo "  -h, --help                    Show this help message"
+            echo ""
+            echo "Examples:"
+            echo "  $0 --model llava-hf/llava-1.5-7b-hf"
+            echo "  $0 --model microsoft/Phi-3.5-vision-instruct"
+            echo "  $0 --model Qwen/Qwen2.5-VL-7B-Instruct"
+            echo "  $0 --model Qwen/Qwen3-VL-2B-Instruct --single-gpu"
+            echo ""
+            exit 0
+            ;;
+        *)
+            # Diagnostics go to stderr so they are not mixed into captured stdout.
+            echo "Unknown option: $1" >&2
+            echo "Use --help for usage information" >&2
+            exit 1
+            ;;
+    esac
+done
+# Target accelerator platform (cuda or xpu) and the name of the environment
+# variable used to pin each worker to a device. Both may be preset by the
+# caller; DEVICE_AFFINITY_ENV is only derived when it is unset or empty.
+DEVICE_PLATFORM="${DEVICE_PLATFORM:-cuda}"
+if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
+    # Match case-insensitively by lower-casing the platform name first.
+    case "${DEVICE_PLATFORM,,}" in
+        xpu)
+            DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK"
+            ;;
+        *)
+            DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES"
+            ;;
+    esac
+fi
+# Port reported in the banner; falls back to 8000 when DYN_HTTP_PORT is unset or empty.
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+# Human-readable GPU count shown in the banner title.
+case "$SINGLE_GPU" in
+    true)
+        GPU_LABEL="1 GPU"
+        ;;
+    *)
+        GPU_LABEL="3 GPUs"
+        ;;
+esac
+print_launch_banner --multimodal "Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT"
+# Launch the frontend in the background (no router mode).
+echo "Starting frontend..."
+# No --http-port flag is passed here: dynamo.frontend also honours the
+# DYN_HTTP_PORT env var and defaults to 8000.
+python -m dynamo.frontend &
+# Extra vLLM flags: EXTRA_ARGS is passed to every worker, PD_EXTRA_ARGS only to
+# the prefill and decode workers.
+EXTRA_ARGS=""
+PD_EXTRA_ARGS=""
+# GPU assignments (override via environment variables)
+: "${DYN_ENCODE_WORKER_GPU:=0}"
+: "${DYN_PREFILL_WORKER_GPU:=1}"
+: "${DYN_DECODE_WORKER_GPU:=2}"
+# GPU memory utilization for workers.
+# NOTE: --kv-cache-memory-bytes (set below for P/D workers) overrides
+# --gpu-memory-utilization for KV cache sizing. Per vLLM CacheConfig:
+# "kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization"
+# Ref: https://docs.vllm.ai/en/stable/api/vllm/config/cache/
+# Therefore _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE has no effect on actual VRAM
+# usage when --kv-cache-memory-bytes is set.
+if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" ]]; then
+    {
+        echo "WARNING: _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE is set but has no effect here because"
+        echo "  --kv-cache-memory-bytes overrides --gpu-memory-utilization in vLLM."
+    } >&2
+fi
+# Per-worker --gpu-memory-utilization fractions (override via environment variables)
+: "${DYN_ENCODE_GPU_MEM:=0.9}"
+: "${DYN_PREFILL_GPU_MEM:=0.9}"
+: "${DYN_DECODE_GPU_MEM:=0.9}"
+# 512 MB KV cache per P/D worker. Setting --kv-cache-memory-bytes bypasses vLLM's
+# memory profiling entirely (both language model and multimodal encoder), which avoids
+# OOM during profiling when 3 workers share a GPU. 512 MB covers the
+# minimum vLLM requires for max_model_len=4096 on Qwen3-VL-2B.
+PD_KV_CACHE_BYTES=$((512 * 1024 * 1024))
+# Single-GPU mode: eager execution on every worker, plus a fixed KV cache size,
+# a 4096-token context and one image per prompt on the prefill/decode workers.
+if [[ "$SINGLE_GPU" == "true" ]]; then
+    EXTRA_ARGS="--enforce-eager"
+    PD_EXTRA_ARGS="--max-model-len 4096 --kv-cache-memory-bytes $PD_KV_CACHE_BYTES --limit-mm-per-prompt {\"image\":1,\"video\":0,\"audio\":0}"
+fi
+# XPU-specific flags. EXTRA_ARGS is appended to, so --enforce-eager survives.
+if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
+    EXTRA_ARGS="$EXTRA_ARGS --block-size 64"
+    # Apply the XPU context-length default only when --single-gpu has not
+    # already configured the P/D workers. Assigning unconditionally would
+    # discard the KV cache size and multimodal limits set above.
+    # NOTE(review): assumes XPU single-GPU runs should keep the 4096-token
+    # limit rather than 10240 - confirm with the XPU owners.
+    if [[ "$SINGLE_GPU" != "true" ]]; then
+        PD_EXTRA_ARGS="--max-model-len 10240"
+    fi
+fi
+# Launch the three vLLM workers in the background, each pinned to its GPU via
+# the variable named by DEVICE_AFFINITY_ENV and given its own NIXL side-channel
+# port and KV-events endpoint.
+# $EXTRA_ARGS / $PD_EXTRA_ARGS are deliberately left unquoted: they hold
+# space-separated flag lists that must word-split into separate arguments.
+# Every other expansion is quoted so unusual values cannot split or glob.
+
+# Start encode worker
+echo "Starting encode worker on GPU $DYN_ENCODE_WORKER_GPU (GPU mem: $DYN_ENCODE_GPU_MEM)..."
+VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
+env "$DEVICE_AFFINITY_ENV=$DYN_ENCODE_WORKER_GPU" \
+python -m dynamo.vllm \
+    --multimodal-encode-worker \
+    --enable-multimodal \
+    --enable-mm-embeds \
+    --model "$MODEL_NAME" \
+    --gpu-memory-utilization "$DYN_ENCODE_GPU_MEM" \
+    $EXTRA_ARGS \
+    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' \
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
+# Start prefill worker (also handles encode routing via --route-to-encoder)
+echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
+VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
+env "$DEVICE_AFFINITY_ENV=$DYN_PREFILL_WORKER_GPU" \
+python -m dynamo.vllm \
+    --route-to-encoder \
+    --disaggregation-mode prefill \
+    --enable-multimodal \
+    --enable-mm-embeds \
+    --model "$MODEL_NAME" \
+    --gpu-memory-utilization "$DYN_PREFILL_GPU_MEM" \
+    $EXTRA_ARGS \
+    $PD_EXTRA_ARGS \
+    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' \
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
+# Start decode worker
+echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..."
+VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
+env "$DEVICE_AFFINITY_ENV=$DYN_DECODE_WORKER_GPU" \
+python -m dynamo.vllm \
+    --disaggregation-mode decode \
+    --enable-multimodal \
+    --enable-mm-embeds \
+    --model "$MODEL_NAME" \
+    --gpu-memory-utilization "$DYN_DECODE_GPU_MEM" \
+    $EXTRA_ARGS \
+    $PD_EXTRA_ARGS \
+    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' \
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
+# Tell the user that every component has been launched in the background.
+printf '%s\n' \
+    "==================================================" \
+    "All components started. Waiting for initialization..." \
+    "=================================================="
+# Block until the first background process exits; the EXIT trap's `kill 0`
+# then tears down whatever is still running.
+wait_any_exit