Unverified commit e0a2e7bb, authored by zxue2, committed by GitHub
Browse files

feat: enable intel xpu dockerfile (#6109)


Signed-off-by: Zhan Xue <zhan.xue@intel.com>
Co-authored-by: XinYu Ye <xinyu.ye@intel.com>
Co-authored-by: Hongming Zheng <hongming.zheng@intel.com>
parent ec63ff72
...@@ -10,10 +10,11 @@ ...@@ -10,10 +10,11 @@
# when building. # when building.
dynamo: dynamo:
base_image: nvcr.io/nvidia/cuda-dl-base
cuda12.9: cuda12.9:
base_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04 base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
cuda13.0: cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
epp_image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.5.1 epp_image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.5.1
frontend_image: nvcr.io/nvidia/base/ubuntu:noble-20250619 frontend_image: nvcr.io/nvidia/base/ubuntu:noble-20250619
...@@ -34,15 +35,24 @@ dynamo: ...@@ -34,15 +35,24 @@ dynamo:
efa_version: 1.45.1 efa_version: 1.45.1
vllm: vllm:
cuda12.9:
base_image: nvcr.io/nvidia/cuda-dl-base base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda runtime_image: nvcr.io/nvidia/cuda
cuda12.9:
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04 base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: 12.9.1-runtime-ubuntu24.04 runtime_image_tag: 12.9.1-runtime-ubuntu24.04
vllm_ref: v0.16.0
cuda13.0: cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: 13.0.2-runtime-ubuntu24.04 runtime_image_tag: 13.0.2-runtime-ubuntu24.04
vllm_ref: v0.16.0 vllm_ref: v0.16.0
xpu:
base_image: intel/deep-learning-essentials
runtime_image: intel/deep-learning-essentials
base_image_tag: 2025.3.2-0-devel-ubuntu24.04
runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04
vllm_ref: v0.14.0
flashinf_ref: v0.6.3 flashinf_ref: v0.6.3
lmcache_ref: 0.3.14 lmcache_ref: 0.3.14
vllm_omni_ref: "v0.16.0rc1" vllm_omni_ref: "v0.16.0rc1"
...@@ -54,12 +64,14 @@ vllm: ...@@ -54,12 +64,14 @@ vllm:
modelexpress_ref: "3d73992ce6c10e52ddc54f7f12af35d27e173f15" modelexpress_ref: "3d73992ce6c10e52ddc54f7f12af35d27e173f15"
sglang: sglang:
cuda12.9:
base_image: nvcr.io/nvidia/cuda-dl-base base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang runtime_image: lmsysorg/sglang
cuda12.9:
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04 base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: v0.5.9-runtime runtime_image_tag: v0.5.9-runtime
cuda13.0: cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: v0.5.9-cu130-runtime runtime_image_tag: v0.5.9-cu130-runtime
enable_media_ffmpeg: "false" enable_media_ffmpeg: "false"
...@@ -67,9 +79,9 @@ sglang: ...@@ -67,9 +79,9 @@ sglang:
enable_kvbm: "false" enable_kvbm: "false"
trtllm: trtllm:
cuda13.1:
base_image: nvcr.io/nvidia/pytorch base_image: nvcr.io/nvidia/pytorch
runtime_image: nvcr.io/nvidia/cuda-dl-base runtime_image: nvcr.io/nvidia/cuda-dl-base
cuda13.1:
base_image_tag: 25.12-py3 base_image_tag: 25.12-py3
runtime_image_tag: 25.12-cuda13.1-runtime-ubuntu24.04 runtime_image_tag: 25.12-cuda13.1-runtime-ubuntu24.04
enable_media_ffmpeg: "false" enable_media_ffmpeg: "false"
......
...@@ -14,6 +14,7 @@ set -euo pipefail ...@@ -14,6 +14,7 @@ set -euo pipefail
VLLM_VER="0.16.0" VLLM_VER="0.16.0"
VLLM_REF="v${VLLM_VER}" VLLM_REF="v${VLLM_VER}"
DEVICE="cuda"
# Basic Configurations # Basic Configurations
ARCH=$(uname -m) ARCH=$(uname -m)
...@@ -30,6 +31,10 @@ VLLM_OMNI_REF="v0.16.0rc1" ...@@ -30,6 +31,10 @@ VLLM_OMNI_REF="v0.16.0rc1"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
--device)
DEVICE="$2"
shift 2
;;
--vllm-ref) --vllm-ref)
VLLM_REF="$2" VLLM_REF="$2"
shift 2 shift 2
...@@ -71,8 +76,9 @@ while [[ $# -gt 0 ]]; do ...@@ -71,8 +76,9 @@ while [[ $# -gt 0 ]]; do
shift 2 shift 2
;; ;;
-h|--help) -h|--help)
echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]" echo "Usage: $0 [--device DEVICE] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Options:" echo "Options:"
echo " --device DEVICE Device Selection (default: cuda)"
echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})" echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})"
echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})" echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})"
echo " --arch ARCH Architecture amd64|arm64 (default: auto-detect)" echo " --arch ARCH Architecture amd64|arm64 (default: auto-detect)"
...@@ -107,20 +113,28 @@ elif [ "$ARCH" = "arm64" ]; then ...@@ -107,20 +113,28 @@ elif [ "$ARCH" = "arm64" ]; then
fi fi
export MAX_JOBS=$MAX_JOBS export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda if [ "$DEVICE" = "cuda" ]; then
export CUDA_HOME=/usr/local/cuda
# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129") # Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')" TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
echo "=== Installing prerequisites ===" echo "=== Installing prerequisites ==="
uv pip install pip cuda-python uv pip install pip cuda-python
fi
echo "\n=== Configuration Summary ===" if [ "$DEVICE" = "cuda" ]; then
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND" echo "\n=== Configuration Summary ==="
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR" echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
elif [ "$DEVICE" = "xpu" ]; then
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR"
fi
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then if [ "$DEVICE" = "cuda" ]; then
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF" echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
echo "\n=== Installing LMCache ===" echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then if [ "$ARCH" = "amd64" ]; then
...@@ -131,11 +145,18 @@ if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then ...@@ -131,11 +145,18 @@ if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
else else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)" echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi fi
else else
echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF" echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi
elif [ "$DEVICE" = "xpu" ]; then
echo " LMCACHE_REF=$LMCACHE_REF "
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
uv pip install lmcache==${LMCACHE_REF}
echo "✓ LMCache ${LMCACHE_REF} installed"
fi
fi fi
echo "\n=== Cloning vLLM repository ===" echo "\n=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts # Clone needed for DeepGEMM and EP kernels install scripts
cd $INSTALLATION_DIR cd $INSTALLATION_DIR
...@@ -144,30 +165,37 @@ cd vllm ...@@ -144,30 +165,37 @@ cd vllm
git checkout $VLLM_REF git checkout $VLLM_REF
echo "✓ vLLM repository cloned" echo "✓ vLLM repository cloned"
if [ "$DEVICE" = "xpu" ]; then
echo "\n=== Installing vLLM ==="
git apply --ignore-whitespace /tmp/vllm-xpu.patch
uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match
uv pip install --verbose --no-build-isolation .
fi
echo "\n=== Installing vLLM & FlashInfer ===" if [ "$DEVICE" = "cuda" ]; then
echo "\n=== Installing vLLM & FlashInfer ==="
# Build GitHub release wheel URL per CUDA version # Build GitHub release wheel URL per CUDA version
# CUDA 12 wheels have no +cu suffix and use manylinux_2_31 # CUDA 12 wheels have no +cu suffix and use manylinux_2_31
# CUDA 13 wheels have +cu130 suffix and use manylinux_2_35 # CUDA 13 wheels have +cu130 suffix and use manylinux_2_35
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl" VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS="" EXTRA_PIP_ARGS=""
elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl" VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}" EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}"
else else
echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}" echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
exit 1 exit 1
fi fi
VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}" VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}"
# Install vLLM wheel # Install vLLM wheel
# CUDA 12: Try PyPI first, fall back to GitHub release # CUDA 12: Try PyPI first, fall back to GitHub release
# CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend # CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend
# does not prevent uv from resolving the cu12 variant) # does not prevent uv from resolving the cu12 variant)
echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..." echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..."
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then
echo "✓ vLLM ${VLLM_VER} installed from PyPI" echo "✓ vLLM ${VLLM_VER} installed from PyPI"
else else
...@@ -177,15 +205,16 @@ if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then ...@@ -177,15 +205,16 @@ if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
--torch-backend=${TORCH_BACKEND} --torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub" echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi fi
else else
echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..." echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..."
uv pip install ${EXTRA_PIP_ARGS} \ uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \ "${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND} --torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub" echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
fi fi
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed" echo "✓ vLLM installation completed"
echo "\n=== Installing vLLM-Omni ===" echo "\n=== Installing vLLM-Omni ==="
...@@ -210,18 +239,19 @@ else ...@@ -210,18 +239,19 @@ else
echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)" echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi fi
echo "\n=== Installing DeepGEMM ===" if [ "$DEVICE" = "cuda" ]; then
cd $INSTALLATION_DIR/vllm/tools echo "\n=== Installing DeepGEMM ==="
if [ -n "$DEEPGEMM_REF" ]; then cd $INSTALLATION_DIR/vllm/tools
if [ -n "$DEEPGEMM_REF" ]; then
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF" bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else else
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
fi fi
echo "✓ DeepGEMM installation completed" echo "✓ DeepGEMM installation completed"
echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
fi
echo "\n✅ All installations completed successfully!" echo "\n✅ All installations completed successfully!"
...@@ -21,6 +21,15 @@ def parse_args(): ...@@ -21,6 +21,15 @@ def parse_args():
choices=["dynamo", "vllm", "sglang", "trtllm"], choices=["dynamo", "vllm", "sglang", "trtllm"],
help="Dockerfile framework to use", help="Dockerfile framework to use",
) )
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda", "xpu"],
help="Dockerfile device to use",
)
parser.add_argument( parser.add_argument(
"--target", "--target",
type=str, type=str,
...@@ -58,6 +67,7 @@ def parse_args(): ...@@ -58,6 +67,7 @@ def parse_args():
def validate_args(args): def validate_args(args):
valid_inputs = { valid_inputs = {
"vllm": { "vllm": {
"device": ["cuda", "xpu"],
"target": [ "target": [
"runtime", "runtime",
"dev", "dev",
...@@ -69,6 +79,7 @@ def validate_args(args): ...@@ -69,6 +79,7 @@ def validate_args(args):
"cuda_version": ["12.9", "13.0"], "cuda_version": ["12.9", "13.0"],
}, },
"trtllm": { "trtllm": {
"device": ["cuda"],
"target": [ "target": [
"runtime", "runtime",
"dev", "dev",
...@@ -80,6 +91,7 @@ def validate_args(args): ...@@ -80,6 +91,7 @@ def validate_args(args):
"cuda_version": ["13.1"], "cuda_version": ["13.1"],
}, },
"sglang": { "sglang": {
"device": ["cuda"],
"target": [ "target": [
"runtime", "runtime",
"dev", "dev",
...@@ -90,6 +102,7 @@ def validate_args(args): ...@@ -90,6 +102,7 @@ def validate_args(args):
"cuda_version": ["12.9", "13.0"], "cuda_version": ["12.9", "13.0"],
}, },
"dynamo": { "dynamo": {
"device": ["cuda"],
"target": [ "target": [
"runtime", "runtime",
"dev", "dev",
...@@ -106,14 +119,16 @@ def validate_args(args): ...@@ -106,14 +119,16 @@ def validate_args(args):
if ( if (
args.target in valid_inputs[args.framework]["target"] args.target in valid_inputs[args.framework]["target"]
and args.cuda_version in valid_inputs[args.framework]["cuda_version"] and args.cuda_version in valid_inputs[args.framework]["cuda_version"]
and args.device in valid_inputs[args.framework]["device"]
): ):
return return
raise ValueError( raise ValueError(
f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version}]" f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version},device={args.device}]"
) )
raise ValueError( raise ValueError(
f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version}]" f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version},device={args.device}]"
) )
...@@ -128,6 +143,7 @@ def render(args, context, script_dir): ...@@ -128,6 +143,7 @@ def render(args, context, script_dir):
rendered = template.render( rendered = template.render(
context=context, context=context,
framework=args.framework, framework=args.framework,
device=args.device,
target=args.target, target=args.target,
platform=args.platform, platform=args.platform,
cuda_version=args.cuda_version, cuda_version=args.cuda_version,
...@@ -139,7 +155,7 @@ def render(args, context, script_dir): ...@@ -139,7 +155,7 @@ def render(args, context, script_dir):
if args.output_short_filename: if args.output_short_filename:
filename = "rendered.Dockerfile" filename = "rendered.Dockerfile"
else: else:
filename = f"{args.framework}-{args.target}-cuda{args.cuda_version}-{args.platform}-rendered.Dockerfile" filename = f"{args.framework}-{args.target}-{args.device}{args.cuda_version}-{args.platform}-rendered.Dockerfile"
with open(f"{script_dir}/{filename}", "w") as f: with open(f"{script_dir}/{filename}", "w") as f:
f.write(cleaned) f.write(cleaned)
...@@ -159,6 +175,9 @@ def render(args, context, script_dir): ...@@ -159,6 +175,9 @@ def render(args, context, script_dir):
def main(): def main():
args = parse_args() args = parse_args()
validate_args(args) validate_args(args)
# Clear cuda version for non-cuda device
if args.device != "cuda":
args.cuda_version = ""
script_dir = Path(__file__).parent script_dir = Path(__file__).parent
with open(f"{script_dir}/context.yaml", "r") as f: with open(f"{script_dir}/context.yaml", "r") as f:
context = yaml.safe_load(f) context = yaml.safe_load(f)
......
...@@ -18,21 +18,35 @@ ...@@ -18,21 +18,35 @@
#TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg #TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg
ARG ARCH={{ platform }} ARG ARCH={{ platform }}
ARG ARCH_ALT={{ "x86_64" if platform == "amd64" else "aarch64" }} ARG ARCH_ALT={{ "x86_64" if platform == "amd64" else "aarch64" }}
ARG DEVICE={{ device }}
{% if device == "cuda" -%}
{% set device_key = device + cuda_version -%}
{% else -%}
{% set device_key = device -%}
{% endif %}
# Python/CUDA configuration # Python/CUDA configuration
ARG PYTHON_VERSION={{ context.dynamo.python_version }} ARG PYTHON_VERSION={{ context.dynamo.python_version }}
{% if device == "cuda" -%}
ARG CUDA_VERSION={{ cuda_version }} ARG CUDA_VERSION={{ cuda_version }}
ARG CUDA_MAJOR=${CUDA_VERSION%%.*} ARG CUDA_MAJOR=${CUDA_VERSION%%.*}
{% endif %}
# Base and runtime images configuration # Base and runtime images configuration
{% set cuda_context_key = "cuda" + cuda_version %} ARG BASE_IMAGE={{ context[framework][device_key].base_image }}
ARG BASE_IMAGE={{ context[framework].base_image }} ARG BASE_IMAGE_TAG={{ context[framework][device_key].base_image_tag }}
ARG BASE_IMAGE_TAG={{ context[framework][cuda_context_key].base_image_tag }}
{% if framework in ["sglang", "trtllm", "vllm"] -%} {% if framework in ["sglang", "trtllm", "vllm"] -%}
ARG RUNTIME_IMAGE={{ context[framework].runtime_image }} ARG RUNTIME_IMAGE={{ context[framework][device_key].runtime_image }}
ARG RUNTIME_IMAGE_TAG={{ context[framework][cuda_context_key].runtime_image_tag }} ARG RUNTIME_IMAGE_TAG={{ context[framework][device_key].runtime_image_tag }}
{%- endif %} {%- endif %}
# wheel builder image selection
{% if device == "xpu" %}
ARG WHEEL_BUILDER_IMAGE=${BASE_IMAGE}:${BASE_IMAGE_TAG}
{% else %}
ARG WHEEL_BUILDER_IMAGE=quay.io/pypa/manylinux_2_28_${ARCH_ALT}
{% endif %}
# Build configuration # Build configuration
ARG ENABLE_KVBM={{ context[framework].enable_kvbm }} ARG ENABLE_KVBM={{ context[framework].enable_kvbm }}
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -42,7 +56,9 @@ ARG ETCD_VERSION={{ context.dynamo.etcd_version }} ...@@ -42,7 +56,9 @@ ARG ETCD_VERSION={{ context.dynamo.etcd_version }}
ARG ENABLE_MEDIA_FFMPEG={{ context[framework].enable_media_ffmpeg }} ARG ENABLE_MEDIA_FFMPEG={{ context[framework].enable_media_ffmpeg }}
ARG FFMPEG_VERSION={{ context.dynamo.ffmpeg_version }} ARG FFMPEG_VERSION={{ context.dynamo.ffmpeg_version }}
{% if device == "cuda" -%}
ARG ENABLE_GPU_MEMORY_SERVICE={{ context[framework].enable_gpu_memory_service }} ARG ENABLE_GPU_MEMORY_SERVICE={{ context[framework].enable_gpu_memory_service }}
{% endif %}
# SCCACHE configuration # SCCACHE configuration
ARG USE_SCCACHE ARG USE_SCCACHE
...@@ -52,8 +68,10 @@ ARG SCCACHE_REGION="" ...@@ -52,8 +68,10 @@ ARG SCCACHE_REGION=""
# NIXL configuration # NIXL configuration
ARG NIXL_UCX_REF={{ context.dynamo.nixl_ucx_ref }} ARG NIXL_UCX_REF={{ context.dynamo.nixl_ucx_ref }}
ARG NIXL_REF={{ context.dynamo.nixl_ref }} ARG NIXL_REF={{ context.dynamo.nixl_ref }}
{% if device == "cuda" %}
ARG NIXL_GDRCOPY_REF={{ context.dynamo.nixl_gdrcopy_ref }} ARG NIXL_GDRCOPY_REF={{ context.dynamo.nixl_gdrcopy_ref }}
ARG NIXL_LIBFABRIC_REF={{ context.dynamo.nixl_libfabric_ref }} ARG NIXL_LIBFABRIC_REF={{ context.dynamo.nixl_libfabric_ref }}
{% endif %}
{% if target == "dev" or target == "local-dev" %} {% if target == "dev" or target == "local-dev" %}
ARG FRAMEWORK={{ framework }} ARG FRAMEWORK={{ framework }}
...@@ -66,19 +84,23 @@ ARG FRONTEND_IMAGE={{ context.dynamo.frontend_image }} ...@@ -66,19 +84,23 @@ ARG FRONTEND_IMAGE={{ context.dynamo.frontend_image }}
{% if framework == "vllm" -%} {% if framework == "vllm" -%}
# Make sure to update the dependency version in pyproject.toml when updating this # Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF={{ context.vllm.vllm_ref }} ARG VLLM_REF={{ context[framework][device_key].vllm_ref }}
ARG MAX_JOBS={{ context.vllm.max_jobs }} ARG MAX_JOBS={{ context.vllm.max_jobs }}
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
{% if device == "cuda" -%}
ARG FLASHINF_REF={{ context.vllm.flashinf_ref }} ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
{% endif %}
ARG LMCACHE_REF={{ context.vllm.lmcache_ref }} ARG LMCACHE_REF={{ context.vllm.lmcache_ref }}
ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }} ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
{% if device == "cuda" -%}
# If left blank, then we will fallback to vLLM defaults # If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF="" ARG DEEPGEMM_REF=""
# ModelExpress for P2P weight transfer (optional) # ModelExpress for P2P weight transfer (optional)
ARG ENABLE_MODELEXPRESS_P2P={{ context.vllm.enable_modelexpress_p2p }} ARG ENABLE_MODELEXPRESS_P2P={{ context.vllm.enable_modelexpress_p2p }}
ARG MODELEXPRESS_REF={{ context.vllm.modelexpress_ref }} ARG MODELEXPRESS_REF={{ context.vllm.modelexpress_ref }}
{% endif %}
{%- endif -%} {%- endif -%}
{% if framework == "trtllm" %} {% if framework == "trtllm" %}
......
...@@ -15,6 +15,7 @@ FROM runtime AS dynamo_tools ...@@ -15,6 +15,7 @@ FROM runtime AS dynamo_tools
ARG ARCH ARG ARCH
ARG ARCH_ALT ARG ARCH_ALT
ARG DEVICE
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV PATH=/usr/local/bin:${PATH} ENV PATH=/usr/local/bin:${PATH}
......
...@@ -15,6 +15,7 @@ FROM aws AS local-dev ...@@ -15,6 +15,7 @@ FROM aws AS local-dev
ENV USERNAME=dynamo ENV USERNAME=dynamo
ARG USER_UID ARG USER_UID
ARG USER_GID ARG USER_GID
ARG DEVICE
# rustup is already at /home/dynamo/.rustup from the dev stage (COPY --from=wheel_builder # rustup is already at /home/dynamo/.rustup from the dev stage (COPY --from=wheel_builder
# with --chown=dynamo:0 --chmod=775), so no re-copy needed here. # with --chown=dynamo:0 --chmod=775), so no re-copy needed here.
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# PURPOSE: Framework development and vLLM compilation # PURPOSE: Framework development and vLLM compilation
# #
# This stage builds and compiles framework dependencies including: # This stage builds and compiles framework dependencies including:
# - vLLM inference engine with CUDA support # - vLLM inference engine with CUDA/XPU support
# - DeepGEMM and FlashInfer optimizations # - DeepGEMM and FlashInfer optimizations
# - All necessary build tools and compilation dependencies # - All necessary build tools and compilation dependencies
# - Framework-level Python packages and extensions # - Framework-level Python packages and extensions
...@@ -27,6 +27,7 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework ...@@ -27,6 +27,7 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/ COPY --from=dynamo_base /bin/uv /bin/uvx /bin/
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG DEVICE
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds. # Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
...@@ -65,15 +66,27 @@ ARG ARCH ...@@ -65,15 +66,27 @@ ARG ARCH
# rebuilds from unrelated source code changes # rebuilds from unrelated source code changes
ARG VLLM_REF ARG VLLM_REF
ARG VLLM_GIT_URL ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG LMCACHE_REF ARG LMCACHE_REF
ARG VLLM_OMNI_REF ARG VLLM_OMNI_REF
{% if device == "cuda" %}
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG CUDA_VERSION ARG CUDA_VERSION
{% endif %}
ARG MAX_JOBS ARG MAX_JOBS
ENV MAX_JOBS=$MAX_JOBS ENV MAX_JOBS=$MAX_JOBS
{% if device == "cuda" %}
ENV CUDA_HOME=/usr/local/cuda ENV CUDA_HOME=/usr/local/cuda
{% endif %}
{% if device == "xpu" %}
RUN wget --tries=3 --waitretry=5 https://raw.githubusercontent.com/intel/llm-scaler/35a14cbc08d714f460a29b7a7328df5620c8530f/vllm/patches/ai-dynamo-xpu/patches/vllm-xpu-v0.14.0.patch -O /tmp/vllm-xpu.patch
ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
{% endif %}
# Install VLLM and related dependencies # Install VLLM and related dependencies
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
...@@ -82,16 +95,19 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ ...@@ -82,16 +95,19 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \ cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \ chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh \ /tmp/install_vllm.sh \
--device $DEVICE \
--vllm-ref $VLLM_REF \ --vllm-ref $VLLM_REF \
--max-jobs $MAX_JOBS \ --max-jobs $MAX_JOBS \
--arch $ARCH \ --arch $ARCH \
--installation-dir /opt \ --installation-dir /opt \
${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \ ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \ ${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \
--cuda-version $CUDA_VERSION ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
${CUDA_VERSION:+--cuda-version "$CUDA_VERSION"}
{% if device == "cuda" %}
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\ /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH $LD_LIBRARY_PATH
{% endif %}
...@@ -24,10 +24,19 @@ ...@@ -24,10 +24,19 @@
FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
ARG DEVICE
WORKDIR /workspace WORKDIR /workspace
ENV DYNAMO_HOME=/opt/dynamo ENV DYNAMO_HOME=/opt/dynamo
ENV VIRTUAL_ENV=/opt/dynamo/venv ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
{% if device == "xpu" %}
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics
{% endif %}
{% if device == "cuda" %}
# Set CUDA_DEVICE_ORDER to ensure CUDA logical device IDs match NVML physical device IDs # Set CUDA_DEVICE_ORDER to ensure CUDA logical device IDs match NVML physical device IDs
# This fixes NVML InvalidArgument errors when CUDA_VISIBLE_DEVICES is set # This fixes NVML InvalidArgument errors when CUDA_VISIBLE_DEVICES is set
ENV CUDA_DEVICE_ORDER=PCI_BUS_ID ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
...@@ -51,13 +60,19 @@ ENV CPATH=/usr/local/cuda/include \ ...@@ -51,13 +60,19 @@ ENV CPATH=/usr/local/cuda/include \
TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \ TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \ TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
TRITON_CUDART_PATH=/usr/local/cuda/include TRITON_CUDART_PATH=/usr/local/cuda/include
{% endif %}
### COPY NATS & ETCD ### ### COPY NATS & ETCD ###
# Copy nats and etcd from dev image # Copy nats and etcd from dev image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/ COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
{% if device == "xpu" %}
ENV PATH=/usr/local/bin/etcd/:$PATH
{% else %}
# Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible # Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH
{% endif %}
# Copy uv to system /bin # Copy uv to system /bin
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/ COPY --from=dynamo_base /bin/uv /bin/uvx /bin/
...@@ -82,8 +97,10 @@ ENV PYTHON_VERSION=${PYTHON_VERSION} ...@@ -82,8 +97,10 @@ ENV PYTHON_VERSION=${PYTHON_VERSION}
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds. # Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt-get update && \ apt-get update && \
if [ "$DEVICE" = "cuda" ]; then \
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} &&\ CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} &&\
CUDA_VERSION_MINOR=$(echo "${CUDA_VERSION#*.}" | cut -d. -f1) && \ CUDA_VERSION_MINOR=$(echo "${CUDA_VERSION#*.}" | cut -d. -f1); \
fi && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# Python runtime - CRITICAL for virtual environment to work # Python runtime - CRITICAL for virtual environment to work
python${PYTHON_VERSION}-dev \ python${PYTHON_VERSION}-dev \
...@@ -104,11 +121,39 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ ...@@ -104,11 +121,39 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
# prometheus dependencies # prometheus dependencies
ca-certificates \ ca-certificates \
# opencv-python-headless (vLLM dependency) requires libxcb for some functions # opencv-python-headless (vLLM dependency) requires libxcb for some functions
libxcb1 \ libxcb1 && \
if [ "$DEVICE" = "cuda" ]; then \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image # DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR} && \ cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}; \
fi && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
{% if device == "xpu" %}
RUN apt-get update && \
apt-get install -y --no-install-recommends --fix-missing \
#ffmpeg \
libsndfile1 \
libsm6 \
libxext6 \
libgl1 \
lsb-release \
numactl \
wget \
vim \
linux-libc-dev && \
# Install Intel GPU runtime packages
apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing \
intel-ocloc intel-oneapi-compiler-dpcpp-cpp-2025.3 && \
apt-get clean && rm -rf /var/lib/apt/lists/*
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/intel-oneccl-2021.15.7.8_offline.sh && \
bash intel-oneccl-2021.15.7.8_offline.sh -a --silent --eula accept && \
echo "source /opt/intel/oneapi/setvars.sh --force" >> /etc/bash.bashrc && \
rm -f /opt/intel/oneapi/ccl/latest && \
ln -s /opt/intel/oneapi/ccl/2021.15 /opt/intel/oneapi/ccl/latest
{% endif %}
{% if context.vllm.enable_media_ffmpeg == "true" %} {% if context.vllm.enable_media_ffmpeg == "true" %}
# Copy ffmpeg libraries from wheel_builder (requires root, runs before USER dynamo) # Copy ffmpeg libraries from wheel_builder (requires root, runs before USER dynamo)
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
...@@ -124,9 +169,15 @@ ENV HOME=/home/dynamo ...@@ -124,9 +169,15 @@ ENV HOME=/home/dynamo
# This picks up the umask 002 from the /etc/profile.d/00-umask.sh file for subsequent RUN commands # This picks up the umask 002 from the /etc/profile.d/00-umask.sh file for subsequent RUN commands
SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"] SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
{% if device == "xpu" %}
ENV NIXL_PREFIX=/opt/intel/intel_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% else %}
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% endif %}
# Site-packages path derived from PYTHON_VERSION ARG # Site-packages path derived from PYTHON_VERSION ARG
ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages
...@@ -138,15 +189,19 @@ ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages ...@@ -138,15 +189,19 @@ ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages
# #
# Layer sizes (uncompressed): nvidia=4.5GB, flashinfer_jit_cache=4.1GB, torch=2.1GB, # Layer sizes (uncompressed): nvidia=4.5GB, flashinfer_jit_cache=4.1GB, torch=2.1GB,
# vllm=1.2GB, triton=592MB, flashinfer_cubin=437MB # vllm=1.2GB, triton=592MB, flashinfer_cubin=437MB
{% if device == "cuda" %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SITE_PACKAGES}/nvidia COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SITE_PACKAGES}/nvidia
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache
{% endif %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm
{% if platform == "amd64" -%} {% if platform == "amd64" -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni
{% endif -%} {% endif -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton
{% if device == "cuda" %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin
{% endif %}
# Remaining packages and venv structure (bin/, include/, share/, etc.) # Remaining packages and venv structure (bin/, include/, share/, etc.)
COPY --chmod=775 --chown=dynamo:0 --from=framework \ COPY --chmod=775 --chown=dynamo:0 --from=framework \
--exclude=lib/python*/site-packages/nvidia \ --exclude=lib/python*/site-packages/nvidia \
...@@ -166,26 +221,37 @@ COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm ...@@ -166,26 +221,37 @@ COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm
# Copy UCX and NIXL to system directories (read-only, no group-write needed) # Copy UCX and NIXL to system directories (read-only, no group-write needed)
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
{% if device == "xpu" %}
COPY --chown=dynamo: --from=wheel_builder /opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/. ${NIXL_LIB_DIR}/
{% else %}
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/ COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
{% endif %}
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
{% if device == "cuda" %}
# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support) # Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/ COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/ COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/ COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/ COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/
{% endif %}
ENV PATH=/usr/local/ucx/bin:$PATH ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$NIXL_LIB_DIR:\ $NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\ $NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\ /usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\ /usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH $LD_LIBRARY_PATH
{% if device == "cuda" %}
ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH
ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
{% endif %}
# TODO: skip /workspace COPYs for dev/local-dev (bind-mounted from host, these get shadowed) # TODO: skip /workspace COPYs for dev/local-dev (bind-mounted from host, these get shadowed)
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/ COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
...@@ -226,6 +292,7 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \ ...@@ -226,6 +292,7 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
uv pip install /opt/dynamo/wheelhouse/nixl/nixl*.whl uv pip install /opt/dynamo/wheelhouse/nixl/nixl*.whl
{% endif %} {% endif %}
{% if device == "cuda" %}
# Install gpu_memory_service wheel if enabled (all targets) # Install gpu_memory_service wheel if enabled (all targets)
ARG ENABLE_GPU_MEMORY_SERVICE ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
...@@ -235,7 +302,6 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \ ...@@ -235,7 +302,6 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
if [ -n "$GMS_WHEEL" ]; then uv pip install "$GMS_WHEEL"; fi; \ if [ -n "$GMS_WHEEL" ]; then uv pip install "$GMS_WHEEL"; fi; \
fi fi
# Install ModelExpress for P2P weight transfer (optional) # Install ModelExpress for P2P weight transfer (optional)
ARG ENABLE_MODELEXPRESS_P2P ARG ENABLE_MODELEXPRESS_P2P
ARG MODELEXPRESS_REF ARG MODELEXPRESS_REF
...@@ -243,6 +309,7 @@ RUN if [ "${ENABLE_MODELEXPRESS_P2P}" = "true" ]; then \ ...@@ -243,6 +309,7 @@ RUN if [ "${ENABLE_MODELEXPRESS_P2P}" = "true" ]; then \
echo "Installing ModelExpress from ref: ${MODELEXPRESS_REF}" && \ echo "Installing ModelExpress from ref: ${MODELEXPRESS_REF}" && \
uv pip install "modelexpress @ git+https://github.com/ai-dynamo/modelexpress.git@${MODELEXPRESS_REF}#subdirectory=modelexpress_client/python"; \ uv pip install "modelexpress @ git+https://github.com/ai-dynamo/modelexpress.git@${MODELEXPRESS_REF}#subdirectory=modelexpress_client/python"; \
fi fi
{% endif %}
# Install common and test dependencies. Cache uv downloads; uv handles its own locking for this cache. # Install common and test dependencies. Cache uv downloads; uv handles its own locking for this cache.
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
...@@ -274,6 +341,7 @@ RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} & ...@@ -274,6 +341,7 @@ RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} &
echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \ echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
{% if device == "cuda" %}
# Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks) # Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks)
# This recreates proper symlinks to save space and suppress ldconfig warnings # This recreates proper symlinks to save space and suppress ldconfig warnings
RUN cd /usr/local/lib && \ RUN cd /usr/local/lib && \
...@@ -304,15 +372,25 @@ RUN cd /usr/local/lib && \ ...@@ -304,15 +372,25 @@ RUN cd /usr/local/lib && \
fi; \ fi; \
done && \ done && \
ldconfig ldconfig
{% endif %}
USER dynamo USER dynamo
ARG DYNAMO_COMMIT_SHA ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
{% if device == "xpu" %}
RUN uv pip uninstall triton triton-xpu && \
uv pip install triton-xpu==3.6.0 --extra-index-url=https://download.pytorch.org/whl/test/xpu && \
uv pip uninstall oneccl && \
uv pip uninstall oneccl-devel
SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /etc/bash.bashrc && exec bash"]
{% else %}
# In vLLM 0.12 the default sampler changed on the forward pass. # In vLLM 0.12 the default sampler changed on the forward pass.
# We need to enable this to enable the cuda kernels. # We need to enable this to enable the cuda kernels.
ENV VLLM_USE_FLASHINFER_SAMPLER=1 ENV VLLM_USE_FLASHINFER_SAMPLER=1
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD [] CMD []
{% endif %}
...@@ -7,21 +7,21 @@ ...@@ -7,21 +7,21 @@
##### Wheel Build Image ########## ##### Wheel Build Image ##########
################################## ##################################
# Redeclare ARCH_ALT ARG so it's available for interpolation in the FROM instruction FROM ${WHEEL_BUILDER_IMAGE} AS wheel_builder
ARG ARCH_ALT
FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder
# Redeclare ARGs for this stage # Redeclare ARGs for this stage
ARG ARCH ARG ARCH
ARG ARCH_ALT ARG ARCH_ALT
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
ARG DEVICE
WORKDIR /workspace WORKDIR /workspace
{% if device == "cuda" %}
# Copy CUDA from base stage # Copy CUDA from base stage
COPY --from=dynamo_base /usr/local/cuda /usr/local/cuda COPY --from=dynamo_base /usr/local/cuda /usr/local/cuda
COPY --from=dynamo_base /etc/ld.so.conf.d/hpcx.conf /etc/ld.so.conf.d/hpcx.conf COPY --from=dynamo_base /etc/ld.so.conf.d/hpcx.conf /etc/ld.so.conf.d/hpcx.conf
{% endif %}
# Set environment variables first so they can be used in COPY commands # Set environment variables first so they can be used in COPY commands
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
...@@ -34,6 +34,71 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \ ...@@ -34,6 +34,71 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME
{% if device == "xpu" %}
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics
RUN wget --tries=3 --waitretry=5 https://raw.githubusercontent.com/intel/llm-scaler/35a14cbc08d714f460a29b7a7328df5620c8530f/vllm/patches/ai-dynamo-xpu/patches/ucx-v1.12.0.patch -O /tmp/ucx.patch
RUN apt clean && apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
curl \
#ffmpeg \
ca-certificates \
zip \
unzip \
git \
libsndfile1 \
libsm6 \
libxext6 \
libgl1 \
lsb-release \
libaio-dev \
numactl \
wget \
vim \
linux-libc-dev && \
# Install Intel GPU runtime packages
apt update -y && apt upgrade -y && \
apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd \
libze-intel-gpu-raytracing intel-ocloc intel-oneapi-compiler-dpcpp-cpp-2025.3 && \
apt-get clean && rm -rf /var/lib/apt/lists/*
RUN apt-get update -y \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# NIXL build dependencies
autoconf \
automake \
cmake \
git-lfs \
libtool \
meson \
net-tools \
ninja-build \
pybind11-dev \
# Rust build dependencies
clang \
libclang-dev \
protobuf-compiler \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
RUN apt-get update -y \
&& DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall --no-install-recommends \
libibverbs-dev \
rdma-core \
ibverbs-utils \
libibumad-dev \
libnuma-dev \
librdmacm-dev \
ibverbs-providers \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
{% endif %}
{% if device == "cuda" %}
# Install system dependencies # Install system dependencies
# Cache dnf downloads; sharing=locked avoids dnf/rpm races with concurrent builds. # Cache dnf downloads; sharing=locked avoids dnf/rpm races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/dnf,sharing=locked \ RUN --mount=type=cache,target=/var/cache/dnf,sharing=locked \
...@@ -84,7 +149,7 @@ ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \ ...@@ -84,7 +149,7 @@ ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \
LD_LIBRARY_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64:${LD_LIBRARY_PATH}" \ LD_LIBRARY_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64:${LD_LIBRARY_PATH}" \
CC="/opt/rh/gcc-toolset-14/root/usr/bin/gcc" \ CC="/opt/rh/gcc-toolset-14/root/usr/bin/gcc" \
CXX="/opt/rh/gcc-toolset-14/root/usr/bin/g++" CXX="/opt/rh/gcc-toolset-14/root/usr/bin/g++"
{% endif %}
# Ensure a modern protoc is available (required for --experimental_allow_proto3_optional) # Ensure a modern protoc is available (required for --experimental_allow_proto3_optional)
RUN set -eux; \ RUN set -eux; \
...@@ -104,10 +169,16 @@ RUN set -eux; \ ...@@ -104,10 +169,16 @@ RUN set -eux; \
# Point build tools explicitly at the modern protoc # Point build tools explicitly at the modern protoc
ENV PROTOC=/usr/local/bin/protoc ENV PROTOC=/usr/local/bin/protoc
{% if device == "xpu" %}
# Install uv package manager
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH
{% else %}
ENV CUDA_PATH=/usr/local/cuda \ ENV CUDA_PATH=/usr/local/cuda \
PATH=/usr/local/cuda/bin:$PATH \ PATH=/usr/local/cuda/bin:$PATH \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH \ LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH \
NVIDIA_DRIVER_CAPABILITIES=video,compute,utility NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
{% endif %}
# Create virtual environment for building wheels # Create virtual environment for building wheels
ARG PYTHON_VERSION ARG PYTHON_VERSION
...@@ -120,6 +191,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -120,6 +191,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ARG NIXL_UCX_REF ARG NIXL_UCX_REF
ARG NIXL_REF ARG NIXL_REF
{% if device == "cuda" %}
ARG NIXL_GDRCOPY_REF ARG NIXL_GDRCOPY_REF
# Build and install gdrcopy # Build and install gdrcopy
...@@ -129,6 +202,7 @@ RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/g ...@@ -129,6 +202,7 @@ RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/g
rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \ rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \
rpm -Uvh gdrcopy-*.el8.${ARCH_ALT}.rpm && \ rpm -Uvh gdrcopy-*.el8.${ARCH_ALT}.rpm && \
rpm -Uvh gdrcopy-devel-*.el8.noarch.rpm rpm -Uvh gdrcopy-devel-*.el8.noarch.rpm
{% endif %}
# sccache binary is pre-installed in dynamo_base; stage it off-PATH so # sccache binary is pre-installed in dynamo_base; stage it off-PATH so
# Meson doesn't auto-detect it as a CUDA compiler launcher # Meson doesn't auto-detect it as a CUDA compiler launcher
...@@ -159,7 +233,12 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -159,7 +233,12 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \ eval $(/tmp/use-sccache.sh setup-env); \
fi && \ fi && \
dnf install -y pkg-config && \ if [ "$DEVICE" = "xpu" ]; then \
apt-get update -y && apt-get install -y pkg-config; \
apt-get clean && rm -rf /var/lib/apt/lists/*; \
elif [ "$DEVICE" = "cuda" ]; then \
dnf install -y pkg-config; \
fi && \
cd /tmp && \ cd /tmp && \
curl -LO https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz && \ curl -LO https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz && \
tar xf ffmpeg-${FFMPEG_VERSION}.tar.xz && \ tar xf ffmpeg-${FFMPEG_VERSION}.tar.xz && \
...@@ -198,7 +277,26 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -198,7 +277,26 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
git clone https://github.com/openucx/ucx.git && \ git clone https://github.com/openucx/ucx.git && \
cd ucx && \ cd ucx && \
git checkout $NIXL_UCX_REF && \ git checkout $NIXL_UCX_REF && \
if [ "$DEVICE" = "xpu" ]; then \
git apply --ignore-whitespace /tmp/ucx.patch; \
fi && \
./autogen.sh && \ ./autogen.sh && \
if [ "$DEVICE" = "xpu" ]; then \
./contrib/configure-release \
--prefix=/usr/local/ucx \
--with-ze \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-verbs \
--with-dm \
--with-efa \
--without-cuda \
--enable-mt; \
elif [ "$DEVICE" = "cuda" ]; then \
./contrib/configure-release \ ./contrib/configure-release \
--prefix=/usr/local/ucx \ --prefix=/usr/local/ucx \
--enable-shared \ --enable-shared \
...@@ -212,7 +310,8 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -212,7 +310,8 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--with-dm \ --with-dm \
--with-gdrcopy=/usr/local \ --with-gdrcopy=/usr/local \
--with-efa \ --with-efa \
--enable-mt && \ --enable-mt; \
fi && \
make -j && \ make -j && \
make -j install-strip && \ make -j install-strip && \
/tmp/use-sccache.sh show-stats "UCX" && \ /tmp/use-sccache.sh show-stats "UCX" && \
...@@ -220,6 +319,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -220,6 +319,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \ echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \
ldconfig ldconfig
{% if device == "cuda" %}
ARG NIXL_LIBFABRIC_REF ARG NIXL_LIBFABRIC_REF
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
...@@ -248,8 +348,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -248,8 +348,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
/tmp/use-sccache.sh show-stats "LIBFABRIC" && \ /tmp/use-sccache.sh show-stats "LIBFABRIC" && \
echo "/usr/local/libfabric/lib" > /etc/ld.so.conf.d/libfabric.conf && \ echo "/usr/local/libfabric/lib" > /etc/ld.so.conf.d/libfabric.conf && \
ldconfig ldconfig
{% endif %}
{% if framework == "vllm" %} {% if framework == "vllm" and device == "cuda" %}
# Build and install AWS SDK C++ (required for NIXL OBJ backend / S3 support) # Build and install AWS SDK C++ (required for NIXL OBJ backend / S3 support)
ARG AWS_SDK_CPP_VERSION=1.11.581 ARG AWS_SDK_CPP_VERSION=1.11.581
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
...@@ -277,7 +378,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -277,7 +378,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
{% endif %} {% endif %}
# build and install nixl # build and install nixl
{% if device == "cuda" %}
ARG CUDA_MAJOR ARG CUDA_MAJOR
{% endif %}
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \ export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
...@@ -288,22 +392,38 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -288,22 +392,38 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
git clone "https://github.com/ai-dynamo/nixl.git" && \ git clone "https://github.com/ai-dynamo/nixl.git" && \
cd nixl && \ cd nixl && \
git checkout ${NIXL_REF} && \ git checkout ${NIXL_REF} && \
PKG_NAME="nixl-cu${CUDA_MAJOR}" && \ if [ "$DEVICE" = "cuda" ]; then \
PKG_NAME="nixl-cu${CUDA_MAJOR}"; \
elif [ "$DEVICE" = "xpu" ]; then \
PKG_NAME="nixl-xpu"; \
fi && \
./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml && \ ./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml && \
mkdir build && \ mkdir build && \
if [ "$DEVICE" = "cuda" ]; then \
meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \ meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \
-Dcudapath_lib="/usr/local/cuda/lib64" \ -Dcudapath_lib="/usr/local/cuda/lib64" \
-Dcudapath_inc="/usr/local/cuda/include" \ -Dcudapath_inc="/usr/local/cuda/include" \
-Ducx_path="/usr/local/ucx" \ -Ducx_path="/usr/local/ucx" \
-Dlibfabric_path="/usr/local/libfabric" && \ -Dlibfabric_path="/usr/local/libfabric"; \
elif [ "$DEVICE" = "xpu" ]; then \
meson setup build/ --prefix=/opt/intel/intel_nixl --buildtype=release \
-Ducx_path="/usr/local/ucx"; \
fi && \
cd build && \ cd build && \
ninja && \ ninja && \
ninja install && \ ninja install && \
/tmp/use-sccache.sh show-stats "NIXL" /tmp/use-sccache.sh show-stats "NIXL"
{% if device == "xpu" %}
ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu \
NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/plugins \
NIXL_PREFIX=/opt/intel/intel_nixl
{% else %}
ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \ ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \ NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
NIXL_PREFIX=/opt/nvidia/nvda_nixl NIXL_PREFIX=/opt/nvidia/nvda_nixl
{% endif %}
ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH} ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH}
RUN echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \ RUN echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \
...@@ -355,9 +475,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -355,9 +475,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
else \ else \
maturin build --release --out /opt/dynamo/dist; \ maturin build --release --out /opt/dynamo/dist; \
fi && \ fi && \
if [ "$ENABLE_KVBM" == "true" ]; then \ if [ "$ENABLE_KVBM" = "true" ]; then \
cd /opt/dynamo/lib/bindings/kvbm && \ cd /opt/dynamo/lib/bindings/kvbm && \
maturin build --release --out target/wheels && \ maturin build --release --out target/wheels && \
if [ "$DEVICE" = "cuda" ]; then \
auditwheel repair \ auditwheel repair \
--exclude libnixl.so \ --exclude libnixl.so \
--exclude libnixl_build.so \ --exclude libnixl_build.so \
...@@ -366,10 +487,12 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -366,10 +487,12 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--plat manylinux_2_28_${ARCH_ALT} \ --plat manylinux_2_28_${ARCH_ALT} \
--wheel-dir /opt/dynamo/dist \ --wheel-dir /opt/dynamo/dist \
target/wheels/*.whl; \ target/wheels/*.whl; \
elif [ "$DEVICE" = "xpu" ]; then \
cp target/wheels/*.whl /opt/dynamo/dist/; \
fi; \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
{% else %} {% else %}
# Dev/local-dev targets do not have pre-built wheels or /workspace source code. # Dev/local-dev targets do not have pre-built wheels or /workspace source code.
# After you start the local-dev/dev container, you will need to build from source: # After you start the local-dev/dev container, you will need to build from source:
...@@ -387,6 +510,8 @@ COPY lib/gpu_memory_service/ /opt/dynamo/lib/gpu_memory_service/ ...@@ -387,6 +510,8 @@ COPY lib/gpu_memory_service/ /opt/dynamo/lib/gpu_memory_service/
{% endif %} {% endif %}
# Build gpu-memory-service wheel → /opt/dynamo/dist/gpu_memory_service*.whl (small C++ extension, fast build -- all targets, all frameworks) # Build gpu-memory-service wheel → /opt/dynamo/dist/gpu_memory_service*.whl (small C++ extension, fast build -- all targets, all frameworks)
{% if device == "cuda" %}
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \ if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
...@@ -394,3 +519,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -394,3 +519,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
source ${VIRTUAL_ENV}/bin/activate && \ source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \ uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi fi
{% endif %}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment