"vscode:/vscode.git/clone" did not exist on "84377e5da05c73618470891836f2cf0b37d7fc13"
Unverified Commit e0a2e7bb authored by zxue2, committed by GitHub
Browse files

feat: enable intel xpu dockerfile (#6109)


Signed-off-by: Zhan Xue <zhan.xue@intel.com>
Co-authored-by: XinYu Ye <xinyu.ye@intel.com>
Co-authored-by: Hongming Zheng <hongming.zheng@intel.com>
parent ec63ff72
......@@ -10,10 +10,11 @@
# when building.
dynamo:
base_image: nvcr.io/nvidia/cuda-dl-base
cuda12.9:
base_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
epp_image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.5.1
frontend_image: nvcr.io/nvidia/base/ubuntu:noble-20250619
......@@ -34,15 +35,24 @@ dynamo:
efa_version: 1.45.1
vllm:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda
cuda12.9:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: 12.9.1-runtime-ubuntu24.04
vllm_ref: v0.16.0
cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: 13.0.2-runtime-ubuntu24.04
vllm_ref: v0.16.0
vllm_ref: v0.16.0
xpu:
base_image: intel/deep-learning-essentials
runtime_image: intel/deep-learning-essentials
base_image_tag: 2025.3.2-0-devel-ubuntu24.04
runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04
vllm_ref: v0.14.0
flashinf_ref: v0.6.3
lmcache_ref: 0.3.14
vllm_omni_ref: "v0.16.0rc1"
......@@ -54,12 +64,14 @@ vllm:
modelexpress_ref: "3d73992ce6c10e52ddc54f7f12af35d27e173f15"
sglang:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang
cuda12.9:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: v0.5.9-runtime
cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: v0.5.9-cu130-runtime
enable_media_ffmpeg: "false"
......@@ -67,9 +79,9 @@ sglang:
enable_kvbm: "false"
trtllm:
base_image: nvcr.io/nvidia/pytorch
runtime_image: nvcr.io/nvidia/cuda-dl-base
cuda13.1:
base_image: nvcr.io/nvidia/pytorch
runtime_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.12-py3
runtime_image_tag: 25.12-cuda13.1-runtime-ubuntu24.04
enable_media_ffmpeg: "false"
......
......@@ -14,6 +14,7 @@ set -euo pipefail
VLLM_VER="0.16.0"
VLLM_REF="v${VLLM_VER}"
DEVICE="cuda"
# Basic Configurations
ARCH=$(uname -m)
......@@ -30,6 +31,10 @@ VLLM_OMNI_REF="v0.16.0rc1"
while [[ $# -gt 0 ]]; do
case $1 in
--device)
DEVICE="$2"
shift 2
;;
--vllm-ref)
VLLM_REF="$2"
shift 2
......@@ -71,8 +76,9 @@ while [[ $# -gt 0 ]]; do
shift 2
;;
-h|--help)
echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Usage: $0 [--device DEVICE] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Options:"
echo " --device DEVICE Device Selection (default: cuda)"
echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})"
echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})"
echo " --arch ARCH Architecture amd64|arm64 (default: auto-detect)"
......@@ -107,35 +113,50 @@ elif [ "$ARCH" = "arm64" ]; then
fi
export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda
if [ "$DEVICE" = "cuda" ]; then
export CUDA_HOME=/usr/local/cuda
# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
echo "=== Installing prerequisites ==="
uv pip install pip cuda-python
echo "=== Installing prerequisites ==="
uv pip install pip cuda-python
fi
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
if [ "$DEVICE" = "cuda" ]; then
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
elif [ "$DEVICE" = "xpu" ]; then
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR"
fi
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
if [ "$DEVICE" = "cuda" ]; then
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
else
echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi
elif [ "$DEVICE" = "xpu" ]; then
echo " LMCACHE_REF=$LMCACHE_REF "
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
uv pip install lmcache==${LMCACHE_REF}
echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
else
echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi
echo "\n=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts
cd $INSTALLATION_DIR
......@@ -144,48 +165,56 @@ cd vllm
git checkout $VLLM_REF
echo "✓ vLLM repository cloned"
if [ "$DEVICE" = "xpu" ]; then
echo "\n=== Installing vLLM ==="
git apply --ignore-whitespace /tmp/vllm-xpu.patch
uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match
uv pip install --verbose --no-build-isolation .
fi
echo "\n=== Installing vLLM & FlashInfer ==="
if [ "$DEVICE" = "cuda" ]; then
echo "\n=== Installing vLLM & FlashInfer ==="
# Build GitHub release wheel URL per CUDA version
# CUDA 12 wheels have no +cu suffix and use manylinux_2_31
# CUDA 13 wheels have +cu130 suffix and use manylinux_2_35
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS=""
elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}"
else
echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
exit 1
fi
VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}"
# Install vLLM wheel
# CUDA 12: Try PyPI first, fall back to GitHub release
# CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend
# does not prevent uv from resolving the cu12 variant)
echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..."
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then
echo "✓ vLLM ${VLLM_VER} installed from PyPI"
# Build GitHub release wheel URL per CUDA version
# CUDA 12 wheels have no +cu suffix and use manylinux_2_31
# CUDA 13 wheels have +cu130 suffix and use manylinux_2_35
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS=""
elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}"
else
echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
exit 1
fi
VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}"
# Install vLLM wheel
# CUDA 12: Try PyPI first, fall back to GitHub release
# CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend
# does not prevent uv from resolving the cu12 variant)
echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..."
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then
echo "✓ vLLM ${VLLM_VER} installed from PyPI"
else
echo "⚠ PyPI install failed, installing from GitHub release..."
uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi
else
echo "⚠ PyPI install failed, installing from GitHub release..."
echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..."
uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi
else
echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..."
uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub"
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
fi
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed"
echo "\n=== Installing vLLM-Omni ==="
......@@ -210,18 +239,19 @@ else
echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi
echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools
if [ -n "$DEEPGEMM_REF" ]; then
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
fi
echo "✓ DeepGEMM installation completed"
echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
if [ "$DEVICE" = "cuda" ]; then
echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools
if [ -n "$DEEPGEMM_REF" ]; then
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
fi
echo "✓ DeepGEMM installation completed"
echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
fi
echo "\n✅ All installations completed successfully!"
......@@ -21,6 +21,15 @@ def parse_args():
choices=["dynamo", "vllm", "sglang", "trtllm"],
help="Dockerfile framework to use",
)
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda", "xpu"],
help="Dockerfile device to use",
)
parser.add_argument(
"--target",
type=str,
......@@ -58,6 +67,7 @@ def parse_args():
def validate_args(args):
valid_inputs = {
"vllm": {
"device": ["cuda", "xpu"],
"target": [
"runtime",
"dev",
......@@ -69,6 +79,7 @@ def validate_args(args):
"cuda_version": ["12.9", "13.0"],
},
"trtllm": {
"device": ["cuda"],
"target": [
"runtime",
"dev",
......@@ -80,6 +91,7 @@ def validate_args(args):
"cuda_version": ["13.1"],
},
"sglang": {
"device": ["cuda"],
"target": [
"runtime",
"dev",
......@@ -90,6 +102,7 @@ def validate_args(args):
"cuda_version": ["12.9", "13.0"],
},
"dynamo": {
"device": ["cuda"],
"target": [
"runtime",
"dev",
......@@ -106,14 +119,16 @@ def validate_args(args):
if (
args.target in valid_inputs[args.framework]["target"]
and args.cuda_version in valid_inputs[args.framework]["cuda_version"]
and args.device in valid_inputs[args.framework]["device"]
):
return
raise ValueError(
f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version}]"
f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version},device={args.device}]"
)
raise ValueError(
f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version}]"
f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version},device={args.device}]"
)
......@@ -128,6 +143,7 @@ def render(args, context, script_dir):
rendered = template.render(
context=context,
framework=args.framework,
device=args.device,
target=args.target,
platform=args.platform,
cuda_version=args.cuda_version,
......@@ -139,7 +155,7 @@ def render(args, context, script_dir):
if args.output_short_filename:
filename = "rendered.Dockerfile"
else:
filename = f"{args.framework}-{args.target}-cuda{args.cuda_version}-{args.platform}-rendered.Dockerfile"
filename = f"{args.framework}-{args.target}-{args.device}{args.cuda_version}-{args.platform}-rendered.Dockerfile"
with open(f"{script_dir}/{filename}", "w") as f:
f.write(cleaned)
......@@ -159,6 +175,9 @@ def render(args, context, script_dir):
def main():
args = parse_args()
validate_args(args)
# Clear cuda version for non-cuda device
if args.device != "cuda":
args.cuda_version = ""
script_dir = Path(__file__).parent
with open(f"{script_dir}/context.yaml", "r") as f:
context = yaml.safe_load(f)
......
......@@ -18,21 +18,35 @@
#TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg
ARG ARCH={{ platform }}
ARG ARCH_ALT={{ "x86_64" if platform == "amd64" else "aarch64" }}
ARG DEVICE={{ device }}
{% if device == "cuda" -%}
{% set device_key = device + cuda_version -%}
{% else -%}
{% set device_key = device -%}
{% endif %}
# Python/CUDA configuration
ARG PYTHON_VERSION={{ context.dynamo.python_version }}
{% if device == "cuda" -%}
ARG CUDA_VERSION={{ cuda_version }}
ARG CUDA_MAJOR=${CUDA_VERSION%%.*}
{% endif %}
# Base and runtime images configuration
{% set cuda_context_key = "cuda" + cuda_version %}
ARG BASE_IMAGE={{ context[framework].base_image }}
ARG BASE_IMAGE_TAG={{ context[framework][cuda_context_key].base_image_tag }}
ARG BASE_IMAGE={{ context[framework][device_key].base_image }}
ARG BASE_IMAGE_TAG={{ context[framework][device_key].base_image_tag }}
{% if framework in ["sglang", "trtllm", "vllm"] -%}
ARG RUNTIME_IMAGE={{ context[framework].runtime_image }}
ARG RUNTIME_IMAGE_TAG={{ context[framework][cuda_context_key].runtime_image_tag }}
ARG RUNTIME_IMAGE={{ context[framework][device_key].runtime_image }}
ARG RUNTIME_IMAGE_TAG={{ context[framework][device_key].runtime_image_tag }}
{%- endif %}
# wheel builder image selection
{% if device == "xpu" %}
ARG WHEEL_BUILDER_IMAGE=${BASE_IMAGE}:${BASE_IMAGE_TAG}
{% else %}
ARG WHEEL_BUILDER_IMAGE=quay.io/pypa/manylinux_2_28_${ARCH_ALT}
{% endif %}
# Build configuration
ARG ENABLE_KVBM={{ context[framework].enable_kvbm }}
ARG CARGO_BUILD_JOBS
......@@ -42,7 +56,9 @@ ARG ETCD_VERSION={{ context.dynamo.etcd_version }}
ARG ENABLE_MEDIA_FFMPEG={{ context[framework].enable_media_ffmpeg }}
ARG FFMPEG_VERSION={{ context.dynamo.ffmpeg_version }}
{% if device == "cuda" -%}
ARG ENABLE_GPU_MEMORY_SERVICE={{ context[framework].enable_gpu_memory_service }}
{% endif %}
# SCCACHE configuration
ARG USE_SCCACHE
......@@ -52,8 +68,10 @@ ARG SCCACHE_REGION=""
# NIXL configuration
ARG NIXL_UCX_REF={{ context.dynamo.nixl_ucx_ref }}
ARG NIXL_REF={{ context.dynamo.nixl_ref }}
{% if device == "cuda" %}
ARG NIXL_GDRCOPY_REF={{ context.dynamo.nixl_gdrcopy_ref }}
ARG NIXL_LIBFABRIC_REF={{ context.dynamo.nixl_libfabric_ref }}
{% endif %}
{% if target == "dev" or target == "local-dev" %}
ARG FRAMEWORK={{ framework }}
......@@ -66,19 +84,23 @@ ARG FRONTEND_IMAGE={{ context.dynamo.frontend_image }}
{% if framework == "vllm" -%}
# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF={{ context.vllm.vllm_ref }}
ARG VLLM_REF={{ context[framework][device_key].vllm_ref }}
ARG MAX_JOBS={{ context.vllm.max_jobs }}
# FLASHINF_REF is only respected when building vLLM from source, i.e., when VLLM_REF does not start with 'v', or for arm64 builds
{% if device == "cuda" -%}
ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
{% endif %}
ARG LMCACHE_REF={{ context.vllm.lmcache_ref }}
ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
{% if device == "cuda" -%}
# If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF=""
# ModelExpress for P2P weight transfer (optional)
ARG ENABLE_MODELEXPRESS_P2P={{ context.vllm.enable_modelexpress_p2p }}
ARG MODELEXPRESS_REF={{ context.vllm.modelexpress_ref }}
{% endif %}
{%- endif -%}
{% if framework == "trtllm" %}
......
......@@ -15,6 +15,7 @@ FROM runtime AS dynamo_tools
ARG ARCH
ARG ARCH_ALT
ARG DEVICE
ENV DEBIAN_FRONTEND=noninteractive
ENV PATH=/usr/local/bin:${PATH}
......
......@@ -15,6 +15,7 @@ FROM aws AS local-dev
ENV USERNAME=dynamo
ARG USER_UID
ARG USER_GID
ARG DEVICE
# rustup is already at /home/dynamo/.rustup from the dev stage (COPY --from=wheel_builder
# with --chown=dynamo:0 --chmod=775), so no re-copy needed here.
......
......@@ -10,7 +10,7 @@
# PURPOSE: Framework development and vLLM compilation
#
# This stage builds and compiles framework dependencies including:
# - vLLM inference engine with CUDA support
# - vLLM inference engine with CUDA/XPU support
# - DeepGEMM and FlashInfer optimizations
# - All necessary build tools and compilation dependencies
# - Framework-level Python packages and extensions
......@@ -27,6 +27,7 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/
ARG PYTHON_VERSION
ARG DEVICE
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
......@@ -65,15 +66,27 @@ ARG ARCH
# rebuilds from unrelated source code changes
ARG VLLM_REF
ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG LMCACHE_REF
ARG VLLM_OMNI_REF
{% if device == "cuda" %}
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG CUDA_VERSION
{% endif %}
ARG MAX_JOBS
ENV MAX_JOBS=$MAX_JOBS
{% if device == "cuda" %}
ENV CUDA_HOME=/usr/local/cuda
{% endif %}
{% if device == "xpu" %}
RUN wget --tries=3 --waitretry=5 https://raw.githubusercontent.com/intel/llm-scaler/35a14cbc08d714f460a29b7a7328df5620c8530f/vllm/patches/ai-dynamo-xpu/patches/vllm-xpu-v0.14.0.patch -O /tmp/vllm-xpu.patch
ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
{% endif %}
# Install VLLM and related dependencies
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
......@@ -82,16 +95,19 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh \
--device $DEVICE \
--vllm-ref $VLLM_REF \
--max-jobs $MAX_JOBS \
--arch $ARCH \
--installation-dir /opt \
${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \
--cuda-version $CUDA_VERSION
${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
${CUDA_VERSION:+--cuda-version "$CUDA_VERSION"}
{% if device == "cuda" %}
ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH
{% endif %}
......@@ -24,10 +24,19 @@
FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
ARG DEVICE
WORKDIR /workspace
ENV DYNAMO_HOME=/opt/dynamo
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
{% if device == "xpu" %}
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics
{% endif %}
{% if device == "cuda" %}
# Set CUDA_DEVICE_ORDER to ensure CUDA logical device IDs match NVML physical device IDs
# This fixes NVML InvalidArgument errors when CUDA_VISIBLE_DEVICES is set
ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
......@@ -51,13 +60,19 @@ ENV CPATH=/usr/local/cuda/include \
TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
TRITON_CUDART_PATH=/usr/local/cuda/include
{% endif %}
### COPY NATS & ETCD ###
# Copy nats and etcd from dev image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
{% if device == "xpu" %}
ENV PATH=/usr/local/bin/etcd/:$PATH
{% else %}
# Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH
{% endif %}
# Copy uv to system /bin
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/
......@@ -82,8 +97,10 @@ ENV PYTHON_VERSION=${PYTHON_VERSION}
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt-get update && \
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} &&\
CUDA_VERSION_MINOR=$(echo "${CUDA_VERSION#*.}" | cut -d. -f1) && \
if [ "$DEVICE" = "cuda" ]; then \
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} &&\
CUDA_VERSION_MINOR=$(echo "${CUDA_VERSION#*.}" | cut -d. -f1); \
fi && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# Python runtime - CRITICAL for virtual environment to work
python${PYTHON_VERSION}-dev \
......@@ -104,11 +121,39 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
# prometheus dependencies
ca-certificates \
# opencv-python-headless (vLLM dependency) requires libxcb for some functions
libxcb1 \
libxcb1 && \
if [ "$DEVICE" = "cuda" ]; then \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR} && \
cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}; \
fi && \
rm -rf /var/lib/apt/lists/*
{% if device == "xpu" %}
RUN apt-get update && \
apt-get install -y --no-install-recommends --fix-missing \
#ffmpeg \
libsndfile1 \
libsm6 \
libxext6 \
libgl1 \
lsb-release \
numactl \
wget \
vim \
linux-libc-dev && \
# Install Intel GPU runtime packages
apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing \
intel-ocloc intel-oneapi-compiler-dpcpp-cpp-2025.3 && \
apt-get clean && rm -rf /var/lib/apt/lists/*
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/intel-oneccl-2021.15.7.8_offline.sh && \
bash intel-oneccl-2021.15.7.8_offline.sh -a --silent --eula accept && \
echo "source /opt/intel/oneapi/setvars.sh --force" >> /etc/bash.bashrc && \
rm -f /opt/intel/oneapi/ccl/latest && \
ln -s /opt/intel/oneapi/ccl/2021.15 /opt/intel/oneapi/ccl/latest
{% endif %}
{% if context.vllm.enable_media_ffmpeg == "true" %}
# Copy ffmpeg libraries from wheel_builder (requires root, runs before USER dynamo)
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
......@@ -124,9 +169,15 @@ ENV HOME=/home/dynamo
# This picks up the umask 002 from the /etc/profile.d/00-umask.sh file for subsequent RUN commands
SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
{% if device == "xpu" %}
ENV NIXL_PREFIX=/opt/intel/intel_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% else %}
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% endif %}
# Site-packages path derived from PYTHON_VERSION ARG
ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages
......@@ -138,15 +189,19 @@ ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages
#
# Layer sizes (uncompressed): nvidia=4.5GB, flashinfer_jit_cache=4.1GB, torch=2.1GB,
# vllm=1.2GB, triton=592MB, flashinfer_cubin=437MB
{% if device == "cuda" %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SITE_PACKAGES}/nvidia
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache
{% endif %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm
{% if platform == "amd64" -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni
{% endif -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton
{% if device == "cuda" %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin
{% endif %}
# Remaining packages and venv structure (bin/, include/, share/, etc.)
COPY --chmod=775 --chown=dynamo:0 --from=framework \
--exclude=lib/python*/site-packages/nvidia \
......@@ -166,26 +221,37 @@ COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm
# Copy UCX and NIXL to system directories (read-only, no group-write needed)
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
{% if device == "xpu" %}
COPY --chown=dynamo: --from=wheel_builder /opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/. ${NIXL_LIB_DIR}/
{% else %}
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
{% endif %}
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
{% if device == "cuda" %}
# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/
{% endif %}
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH
{% if device == "cuda" %}
ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH
ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
{% endif %}
# TODO: skip /workspace COPYs for dev/local-dev (bind-mounted from host, these get shadowed)
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
......@@ -226,6 +292,7 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
uv pip install /opt/dynamo/wheelhouse/nixl/nixl*.whl
{% endif %}
{% if device == "cuda" %}
# Install gpu_memory_service wheel if enabled (all targets)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
......@@ -235,7 +302,6 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
if [ -n "$GMS_WHEEL" ]; then uv pip install "$GMS_WHEEL"; fi; \
fi
# Install ModelExpress for P2P weight transfer (optional)
ARG ENABLE_MODELEXPRESS_P2P
ARG MODELEXPRESS_REF
......@@ -243,6 +309,7 @@ RUN if [ "${ENABLE_MODELEXPRESS_P2P}" = "true" ]; then \
echo "Installing ModelExpress from ref: ${MODELEXPRESS_REF}" && \
uv pip install "modelexpress @ git+https://github.com/ai-dynamo/modelexpress.git@${MODELEXPRESS_REF}#subdirectory=modelexpress_client/python"; \
fi
{% endif %}
# Install common and test dependencies. Cache uv downloads; uv handles its own locking for this cache.
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
......@@ -274,6 +341,7 @@ RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} &
echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
{% if device == "cuda" %}
# Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks)
# This recreates proper symlinks to save space and suppress ldconfig warnings
RUN cd /usr/local/lib && \
......@@ -304,15 +372,25 @@ RUN cd /usr/local/lib && \
fi; \
done && \
ldconfig
{% endif %}
USER dynamo
ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
{% if device == "xpu" %}
RUN uv pip uninstall triton triton-xpu && \
uv pip install triton-xpu==3.6.0 --extra-index-url=https://download.pytorch.org/whl/test/xpu && \
uv pip uninstall oneccl && \
uv pip uninstall oneccl-devel
SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /etc/bash.bashrc && exec bash"]
{% else %}
# In vLLM 0.12 the default sampler changed on the forward pass.
# We need to enable this to enable the cuda kernels.
ENV VLLM_USE_FLASHINFER_SAMPLER=1
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
{% endif %}
......@@ -7,21 +7,21 @@
##### Wheel Build Image ##########
##################################
# Redeclare ARCH_ALT ARG so it's available for interpolation in the FROM instruction
ARG ARCH_ALT
FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder
FROM ${WHEEL_BUILDER_IMAGE} AS wheel_builder
# Redeclare ARGs for this stage
ARG ARCH
ARG ARCH_ALT
ARG CARGO_BUILD_JOBS
ARG DEVICE
WORKDIR /workspace
{% if device == "cuda" %}
# Copy CUDA from base stage
COPY --from=dynamo_base /usr/local/cuda /usr/local/cuda
COPY --from=dynamo_base /etc/ld.so.conf.d/hpcx.conf /etc/ld.so.conf.d/hpcx.conf
{% endif %}
# Set environment variables first so they can be used in COPY commands
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
......@@ -34,6 +34,71 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME
{% if device == "xpu" %}
# ---- XPU wheel builder: system packages ------------------------------------
# Rendered only for device == "xpu". This branch provisions the builder with
# apt, whereas the CUDA branch further down uses dnf.

# Run every following RUN under bash with pipefail so that a failure on the
# left-hand side of a pipe (e.g. the wget feeding gpg below) fails the build.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Register Intel's oneAPI apt repository (key stored de-armored in a dedicated
# keyring and referenced via signed-by) and the intel-graphics PPA.
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
    add-apt-repository -y ppa:kobuk-team/intel-graphics

# Fetch the UCX patch (pinned to a fixed commit of intel/llm-scaler) that the
# UCX build step applies with `git apply` when DEVICE=xpu.
RUN wget --tries=3 --waitretry=5 https://raw.githubusercontent.com/intel/llm-scaler/35a14cbc08d714f460a29b7a7328df5620c8530f/vllm/patches/ai-dynamo-xpu/patches/ucx-v1.12.0.patch -O /tmp/ucx.patch

# General utilities followed by the Intel GPU runtime and the DPC++ compiler.
# Uses apt-get throughout (the `apt` front end has no stable CLI and warns when
# scripted) and DEBIAN_FRONTEND=noninteractive so no package can block the
# build on a configuration prompt. `--with-new-pkgs` keeps the semantics of the
# `apt upgrade` this replaces.
RUN apt-get clean && apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --fix-missing \
        curl \
        #ffmpeg \
        ca-certificates \
        zip \
        unzip \
        git \
        libsndfile1 \
        libsm6 \
        libxext6 \
        libgl1 \
        lsb-release \
        libaio-dev \
        numactl \
        wget \
        vim \
        linux-libc-dev && \
    # Install Intel GPU runtime packages
    apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get upgrade -y --with-new-pkgs && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd \
        libze-intel-gpu-raytracing intel-ocloc intel-oneapi-compiler-dpcpp-cpp-2025.3 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Toolchains needed by the source builds later in this stage.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # NIXL build dependencies
        autoconf \
        automake \
        cmake \
        git-lfs \
        libtool \
        meson \
        net-tools \
        ninja-build \
        pybind11-dev \
        # Rust build dependencies
        clang \
        libclang-dev \
        protobuf-compiler \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# RDMA / InfiniBand verbs development packages (UCX is configured with
# --with-verbs on XPU). --reinstall forces these packages to be laid down
# again even if the base image already ships them.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall --no-install-recommends \
        libibverbs-dev \
        rdma-core \
        ibverbs-utils \
        libibumad-dev \
        libnuma-dev \
        librdmacm-dev \
        ibverbs-providers \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
{% endif %}
{% if device == "cuda" %}
# Install system dependencies
# Cache dnf downloads; sharing=locked avoids dnf/rpm races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/dnf,sharing=locked \
......@@ -84,7 +149,7 @@ ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \
LD_LIBRARY_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64:${LD_LIBRARY_PATH}" \
CC="/opt/rh/gcc-toolset-14/root/usr/bin/gcc" \
CXX="/opt/rh/gcc-toolset-14/root/usr/bin/g++"
{% endif %}
# Ensure a modern protoc is available (required for --experimental_allow_proto3_optional)
RUN set -eux; \
......@@ -104,10 +169,16 @@ RUN set -eux; \
# Point build tools explicitly at the modern protoc
ENV PROTOC=/usr/local/bin/protoc
{% if device == "xpu" %}
# Install uv package manager
# The uv and uvx binaries are copied straight out of the upstream uv image.
# NOTE(review): the image tag is `latest`, so the uv version can change between
# builds -- consider pinning a version or digest for reproducible builds.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# Put locally installed libraries (/usr/local/lib, /usr/local/lib64) ahead of any
# inherited LD_LIBRARY_PATH entries.
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH
{% else %}
# Non-XPU builds: expose the CUDA toolkit under /usr/local/cuda on PATH and
# LD_LIBRARY_PATH, and set the NVIDIA driver capabilities variable to
# video, compute and utility.
ENV CUDA_PATH=/usr/local/cuda \
PATH=/usr/local/cuda/bin:$PATH \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH \
NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
{% endif %}
# Create virtual environment for building wheels
ARG PYTHON_VERSION
......@@ -120,6 +191,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ARG NIXL_UCX_REF
ARG NIXL_REF
{% if device == "cuda" %}
ARG NIXL_GDRCOPY_REF
# Build and install gdrcopy
......@@ -129,6 +202,7 @@ RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/g
rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \
rpm -Uvh gdrcopy-*.el8.${ARCH_ALT}.rpm && \
rpm -Uvh gdrcopy-devel-*.el8.noarch.rpm
{% endif %}
# sccache binary is pre-installed in dynamo_base; stage it off-PATH so
# Meson doesn't auto-detect it as a CUDA compiler launcher
......@@ -159,7 +233,12 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \
fi && \
dnf install -y pkg-config && \
# Install pkg-config with the package manager of the active builder image
# (apt-get when DEVICE=xpu, dnf when DEVICE=cuda). Every step in the xpu branch is
# chained with && so that a failed install aborts the build; previously the
# cleanup ran after a `;` and its success hid an install failure.
if [ "$DEVICE" = "xpu" ]; then \
    apt-get update -y && \
    apt-get install -y pkg-config && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*; \
elif [ "$DEVICE" = "cuda" ]; then \
    dnf install -y pkg-config; \
fi && \
cd /tmp && \
curl -LO https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz && \
tar xf ffmpeg-${FFMPEG_VERSION}.tar.xz && \
......@@ -195,11 +274,30 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
eval $(/tmp/use-sccache.sh setup-env); \
fi && \
cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \
cd ucx && \
git checkout $NIXL_UCX_REF && \
./autogen.sh && \
./contrib/configure-release \
git clone https://github.com/openucx/ucx.git && \
cd ucx && \
git checkout $NIXL_UCX_REF && \
if [ "$DEVICE" = "xpu" ]; then \
git apply --ignore-whitespace /tmp/ucx.patch; \
fi && \
./autogen.sh && \
if [ "$DEVICE" = "xpu" ]; then \
./contrib/configure-release \
--prefix=/usr/local/ucx \
--with-ze \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-verbs \
--with-dm \
--with-efa \
--without-cuda \
--enable-mt; \
elif [ "$DEVICE" = "cuda" ]; then \
./contrib/configure-release \
--prefix=/usr/local/ucx \
--enable-shared \
--disable-static \
......@@ -212,7 +310,8 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--with-dm \
--with-gdrcopy=/usr/local \
--with-efa \
--enable-mt && \
--enable-mt; \
fi && \
make -j && \
make -j install-strip && \
/tmp/use-sccache.sh show-stats "UCX" && \
......@@ -220,6 +319,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \
ldconfig
{% if device == "cuda" %}
ARG NIXL_LIBFABRIC_REF
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
......@@ -248,8 +348,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
/tmp/use-sccache.sh show-stats "LIBFABRIC" && \
echo "/usr/local/libfabric/lib" > /etc/ld.so.conf.d/libfabric.conf && \
ldconfig
{% endif %}
{% if framework == "vllm" %}
{% if framework == "vllm" and device == "cuda" %}
# Build and install AWS SDK C++ (required for NIXL OBJ backend / S3 support)
ARG AWS_SDK_CPP_VERSION=1.11.581
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
......@@ -277,7 +378,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
{% endif %}
# build and install nixl
{% if device == "cuda" %}
# CUDA_MAJOR is only declared for CUDA builds; the NIXL build step interpolates
# it into the wheel name (nixl-cu${CUDA_MAJOR}).
ARG CUDA_MAJOR
{% endif %}
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
......@@ -288,22 +392,38 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
git clone "https://github.com/ai-dynamo/nixl.git" && \
cd nixl && \
git checkout ${NIXL_REF} && \
PKG_NAME="nixl-cu${CUDA_MAJOR}" && \
if [ "$DEVICE" = "cuda" ]; then \
PKG_NAME="nixl-cu${CUDA_MAJOR}"; \
elif [ "$DEVICE" = "xpu" ]; then \
PKG_NAME="nixl-xpu"; \
fi && \
./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml && \
mkdir build && \
meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \
-Dcudapath_lib="/usr/local/cuda/lib64" \
-Dcudapath_inc="/usr/local/cuda/include" \
-Ducx_path="/usr/local/ucx" \
-Dlibfabric_path="/usr/local/libfabric" && \
if [ "$DEVICE" = "cuda" ]; then \
meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \
-Dcudapath_lib="/usr/local/cuda/lib64" \
-Dcudapath_inc="/usr/local/cuda/include" \
-Ducx_path="/usr/local/ucx" \
-Dlibfabric_path="/usr/local/libfabric"; \
elif [ "$DEVICE" = "xpu" ]; then \
meson setup build/ --prefix=/opt/intel/intel_nixl --buildtype=release \
-Ducx_path="/usr/local/ucx"; \
fi && \
cd build && \
ninja && \
ninja install && \
/tmp/use-sccache.sh show-stats "NIXL"
{% if device == "xpu" %}
# XPU: NIXL is installed under the /opt/intel/intel_nixl meson prefix, and its
# libraries are looked up in the ${ARCH_ALT}-linux-gnu subdirectory of lib/.
ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu \
NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/plugins \
NIXL_PREFIX=/opt/intel/intel_nixl
{% else %}
# Other devices: NIXL is installed under the /opt/nvidia/nvda_nixl meson prefix,
# and its libraries are looked up in lib64/.
ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
NIXL_PREFIX=/opt/nvidia/nvda_nixl
{% endif %}
ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH}
RUN echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \
......@@ -355,21 +475,24 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
else \
maturin build --release --out /opt/dynamo/dist; \
fi && \
if [ "$ENABLE_KVBM" == "true" ]; then \
if [ "$ENABLE_KVBM" = "true" ]; then \
cd /opt/dynamo/lib/bindings/kvbm && \
maturin build --release --out target/wheels && \
auditwheel repair \
--exclude libnixl.so \
--exclude libnixl_build.so \
--exclude libnixl_common.so \
--exclude 'lib*.so*' \
--plat manylinux_2_28_${ARCH_ALT} \
--wheel-dir /opt/dynamo/dist \
target/wheels/*.whl; \
if [ "$DEVICE" = "cuda" ]; then \
auditwheel repair \
--exclude libnixl.so \
--exclude libnixl_build.so \
--exclude libnixl_common.so \
--exclude 'lib*.so*' \
--plat manylinux_2_28_${ARCH_ALT} \
--wheel-dir /opt/dynamo/dist \
target/wheels/*.whl; \
elif [ "$DEVICE" = "xpu" ]; then \
cp target/wheels/*.whl /opt/dynamo/dist/; \
fi; \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"
{% else %}
# Dev/local-dev targets do not have pre-built wheels or /workspace source code.
# After you start the local-dev/dev container, you will need to build from source:
......@@ -387,6 +510,8 @@ COPY lib/gpu_memory_service/ /opt/dynamo/lib/gpu_memory_service/
{% endif %}
# Build gpu-memory-service wheel → /opt/dynamo/dist/gpu_memory_service*.whl (small C++ extension, fast build -- all targets, all frameworks; CUDA device builds only)
{% if device == "cuda" %}
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
......@@ -394,3 +519,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi
{% endif %}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment