Unverified Commit 06f17011 authored by ZhengHongming888's avatar ZhengHongming888 Committed by GitHub
Browse files

feat: Update XPU dockerfile with vllm-v0.17.1 (#7500)


Signed-off-by: default avatarHongming Zheng <hongming.zheng@intel.com>
Co-authored-by: default avatarZhan Xue <zhan.xue@intel.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
parent a58bcc31
...@@ -52,7 +52,7 @@ vllm: ...@@ -52,7 +52,7 @@ vllm:
runtime_image: intel/deep-learning-essentials runtime_image: intel/deep-learning-essentials
base_image_tag: 2025.3.2-0-devel-ubuntu24.04 base_image_tag: 2025.3.2-0-devel-ubuntu24.04
runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04 runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04
vllm_ref: v0.14.0 vllm_ref: v0.17.1
cpu: cpu:
base_image: ubuntu base_image: ubuntu
runtime_image: ubuntu runtime_image: ubuntu
......
...@@ -143,7 +143,6 @@ echo "✓ vLLM repository cloned" ...@@ -143,7 +143,6 @@ echo "✓ vLLM repository cloned"
if [ "$DEVICE" = "xpu" ]; then if [ "$DEVICE" = "xpu" ]; then
echo "\n=== Installing vLLM ===" echo "\n=== Installing vLLM ==="
git apply --ignore-whitespace /tmp/vllm-xpu.patch
uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match
uv pip install --verbose --no-build-isolation . uv pip install --verbose --no-build-isolation .
fi fi
......
...@@ -80,7 +80,7 @@ RUN mkdir -p /home/$USERNAME/.cache/ \ ...@@ -80,7 +80,7 @@ RUN mkdir -p /home/$USERNAME/.cache/ \
{% if device == "xpu" %} {% if device == "xpu" %}
SHELL ["bash", "-c"] SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /root/.bashrc && exec bash"] CMD ["bash", "-c", "source /home/$USERNAME/.bashrc && exec bash"]
{% else %} {% else %}
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD [] CMD []
......
...@@ -87,7 +87,6 @@ ENV CUDA_HOME=/usr/local/cuda ...@@ -87,7 +87,6 @@ ENV CUDA_HOME=/usr/local/cuda
{% endif %} {% endif %}
{% if device == "xpu" %} {% if device == "xpu" %}
RUN wget --tries=3 --waitretry=5 https://raw.githubusercontent.com/intel/llm-scaler/35a14cbc08d714f460a29b7a7328df5620c8530f/vllm/patches/ai-dynamo-xpu/patches/vllm-xpu-v0.14.0.patch -O /tmp/vllm-xpu.patch
ENV VLLM_TARGET_DEVICE=xpu ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
{% endif %} {% endif %}
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf"
# --single-gpu: Packs all 3 workers (encode, prefill, decode) onto a single GPU.
# This is intended for functional testing with small models (e.g. 2B) where CI
# only has 1 GPU available. It reduces performance by:
# - Enabling --enforce-eager (disables torch.compile and CUDA graph capture)
# - Hardcoding P/D KV cache to 512 MB (skips all memory profiling)
# - Limiting --max-model-len to 4096 tokens on P/D workers
# - Limiting P/D workers to image=1,video=0,audio=0 (--limit-mm-per-prompt)
# - Using lower gpu-memory-utilization fractions to share the GPU
SINGLE_GPU=false
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL_NAME=$2
shift 2
;;
--single-gpu)
SINGLE_GPU=true
shift
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Disaggregated multimodal serving with separate Encode/Prefill/Decode workers"
echo ""
echo "Options:"
echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
echo " LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
echo " --single-gpu Pack all 3 workers on 1 GPU (for small models, e.g. 2B)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --model llava-hf/llava-1.5-7b-hf"
echo " $0 --model microsoft/Phi-3.5-vision-instruct"
echo " $0 --model Qwen/Qwen2.5-VL-7B-Instruct"
echo " $0 --model Qwen/Qwen3-VL-2B-Instruct --single-gpu"
echo ""
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done
# Device platform and affinity env name.
# DEVICE_PLATFORM supports: cuda, xpu
DEVICE_PLATFORM="${DEVICE_PLATFORM:-cuda}"
if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK"
else
DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES"
fi
fi
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
if [[ "$SINGLE_GPU" == "true" ]]; then
GPU_LABEL="1 GPU"
else
GPU_LABEL="3 GPUs"
fi
print_launch_banner --multimodal "Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT"
# Start frontend (no router mode)
echo "Starting frontend..."
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
EXTRA_ARGS=""
PD_EXTRA_ARGS=""
# GPU assignments (override via environment variables)
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
DYN_PREFILL_WORKER_GPU=${DYN_PREFILL_WORKER_GPU:-1}
DYN_DECODE_WORKER_GPU=${DYN_DECODE_WORKER_GPU:-2}
# GPU memory utilization for workers.
# NOTE: --kv-cache-memory-bytes (set below for P/D workers) overrides
# --gpu-memory-utilization for KV cache sizing. Per vLLM CacheConfig:
# "kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization"
# Ref: https://docs.vllm.ai/en/stable/api/vllm/config/cache/
# Therefore _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE has no effect on actual VRAM
# usage when --kv-cache-memory-bytes is set.
if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" ]]; then
echo "WARNING: _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE is set but has no effect here because" >&2
echo " --kv-cache-memory-bytes overrides --gpu-memory-utilization in vLLM." >&2
fi
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
# 512 MB KV cache per P/D worker. Setting --kv-cache-memory-bytes bypasses vLLM's
# memory profiling entirely (both language model and multimodal encoder), which avoids
# OOM during profiling when 3 workers share a GPU. 512 MB covers the
# minimum vLLM requires for max_model_len=4096 on Qwen3-VL-2B.
PD_KV_CACHE_BYTES=$((512 * 1024 * 1024))
if [[ "$SINGLE_GPU" == "true" ]]; then
EXTRA_ARGS="--enforce-eager"
PD_EXTRA_ARGS="--max-model-len 4096 --kv-cache-memory-bytes $PD_KV_CACHE_BYTES --limit-mm-per-prompt {\"image\":1,\"video\":0,\"audio\":0}"
fi
if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
EXTRA_ARGS="$EXTRA_ARGS --block-size 64"
PD_EXTRA_ARGS="--max-model-len 10240"
fi
# Start encode worker
echo "Starting encode worker on GPU $DYN_ENCODE_WORKER_GPU (GPU mem: $DYN_ENCODE_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
env $DEVICE_AFFINITY_ENV=$DYN_ENCODE_WORKER_GPU \
python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_ENCODE_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
# Start prefill worker (also handles encode routing via --route-to-encoder)
echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
env $DEVICE_AFFINITY_ENV=$DYN_PREFILL_WORKER_GPU \
python -m dynamo.vllm --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS $PD_EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# Start decode worker
echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
env $DEVICE_AFFINITY_ENV=$DYN_DECODE_WORKER_GPU \
python -m dynamo.vllm --disaggregation-mode decode --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_DECODE_GPU_MEM $EXTRA_ARGS $PD_EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
echo "=================================================="
echo "All components started. Waiting for initialization..."
echo "=================================================="
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment