Unverified commit e0a2e7bb, authored by zxue2 and committed by GitHub
Browse files

feat: enable intel xpu dockerfile (#6109)


Signed-off-by: Zhan Xue <zhan.xue@intel.com>
Co-authored-by: XinYu Ye <xinyu.ye@intel.com>
Co-authored-by: Hongming Zheng <hongming.zheng@intel.com>
parent ec63ff72
...@@ -10,10 +10,11 @@ ...@@ -10,10 +10,11 @@
# when building. # when building.
dynamo: dynamo:
base_image: nvcr.io/nvidia/cuda-dl-base
cuda12.9: cuda12.9:
base_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04 base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
cuda13.0: cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
epp_image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.5.1 epp_image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.5.1
frontend_image: nvcr.io/nvidia/base/ubuntu:noble-20250619 frontend_image: nvcr.io/nvidia/base/ubuntu:noble-20250619
...@@ -34,15 +35,24 @@ dynamo: ...@@ -34,15 +35,24 @@ dynamo:
efa_version: 1.45.1 efa_version: 1.45.1
vllm: vllm:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda
cuda12.9: cuda12.9:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04 base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: 12.9.1-runtime-ubuntu24.04 runtime_image_tag: 12.9.1-runtime-ubuntu24.04
vllm_ref: v0.16.0
cuda13.0: cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: 13.0.2-runtime-ubuntu24.04 runtime_image_tag: 13.0.2-runtime-ubuntu24.04
vllm_ref: v0.16.0 vllm_ref: v0.16.0
xpu:
base_image: intel/deep-learning-essentials
runtime_image: intel/deep-learning-essentials
base_image_tag: 2025.3.2-0-devel-ubuntu24.04
runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04
vllm_ref: v0.14.0
flashinf_ref: v0.6.3 flashinf_ref: v0.6.3
lmcache_ref: 0.3.14 lmcache_ref: 0.3.14
vllm_omni_ref: "v0.16.0rc1" vllm_omni_ref: "v0.16.0rc1"
...@@ -54,12 +64,14 @@ vllm: ...@@ -54,12 +64,14 @@ vllm:
modelexpress_ref: "3d73992ce6c10e52ddc54f7f12af35d27e173f15" modelexpress_ref: "3d73992ce6c10e52ddc54f7f12af35d27e173f15"
sglang: sglang:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang
cuda12.9: cuda12.9:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04 base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: v0.5.9-runtime runtime_image_tag: v0.5.9-runtime
cuda13.0: cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: v0.5.9-cu130-runtime runtime_image_tag: v0.5.9-cu130-runtime
enable_media_ffmpeg: "false" enable_media_ffmpeg: "false"
...@@ -67,9 +79,9 @@ sglang: ...@@ -67,9 +79,9 @@ sglang:
enable_kvbm: "false" enable_kvbm: "false"
trtllm: trtllm:
base_image: nvcr.io/nvidia/pytorch
runtime_image: nvcr.io/nvidia/cuda-dl-base
cuda13.1: cuda13.1:
base_image: nvcr.io/nvidia/pytorch
runtime_image: nvcr.io/nvidia/cuda-dl-base
base_image_tag: 25.12-py3 base_image_tag: 25.12-py3
runtime_image_tag: 25.12-cuda13.1-runtime-ubuntu24.04 runtime_image_tag: 25.12-cuda13.1-runtime-ubuntu24.04
enable_media_ffmpeg: "false" enable_media_ffmpeg: "false"
......
...@@ -14,6 +14,7 @@ set -euo pipefail ...@@ -14,6 +14,7 @@ set -euo pipefail
VLLM_VER="0.16.0" VLLM_VER="0.16.0"
VLLM_REF="v${VLLM_VER}" VLLM_REF="v${VLLM_VER}"
DEVICE="cuda"
# Basic Configurations # Basic Configurations
ARCH=$(uname -m) ARCH=$(uname -m)
...@@ -30,6 +31,10 @@ VLLM_OMNI_REF="v0.16.0rc1" ...@@ -30,6 +31,10 @@ VLLM_OMNI_REF="v0.16.0rc1"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
--device)
DEVICE="$2"
shift 2
;;
--vllm-ref) --vllm-ref)
VLLM_REF="$2" VLLM_REF="$2"
shift 2 shift 2
...@@ -71,8 +76,9 @@ while [[ $# -gt 0 ]]; do ...@@ -71,8 +76,9 @@ while [[ $# -gt 0 ]]; do
shift 2 shift 2
;; ;;
-h|--help) -h|--help)
echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]" echo "Usage: $0 [--device DEVICE] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Options:" echo "Options:"
echo " --device DEVICE Device Selection (default: cuda)"
echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})" echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})"
echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})" echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})"
echo " --arch ARCH Architecture amd64|arm64 (default: auto-detect)" echo " --arch ARCH Architecture amd64|arm64 (default: auto-detect)"
...@@ -107,35 +113,50 @@ elif [ "$ARCH" = "arm64" ]; then ...@@ -107,35 +113,50 @@ elif [ "$ARCH" = "arm64" ]; then
fi fi
export MAX_JOBS=$MAX_JOBS export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda if [ "$DEVICE" = "cuda" ]; then
export CUDA_HOME=/usr/local/cuda
# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129") # Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')" TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
echo "=== Installing prerequisites ===" echo "=== Installing prerequisites ==="
uv pip install pip cuda-python uv pip install pip cuda-python
fi
echo "\n=== Configuration Summary ===" if [ "$DEVICE" = "cuda" ]; then
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND" echo "\n=== Configuration Summary ==="
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR" echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
elif [ "$DEVICE" = "xpu" ]; then
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR"
fi
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then if [ "$DEVICE" = "cuda" ]; then
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF" if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
else
echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi
elif [ "$DEVICE" = "xpu" ]; then
echo " LMCACHE_REF=$LMCACHE_REF "
echo "\n=== Installing LMCache ===" echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues uv pip install lmcache==${LMCACHE_REF}
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
echo "✓ LMCache ${LMCACHE_REF} installed" echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi fi
else
echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi fi
echo "\n=== Cloning vLLM repository ===" echo "\n=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts # Clone needed for DeepGEMM and EP kernels install scripts
cd $INSTALLATION_DIR cd $INSTALLATION_DIR
...@@ -144,48 +165,56 @@ cd vllm ...@@ -144,48 +165,56 @@ cd vllm
git checkout $VLLM_REF git checkout $VLLM_REF
echo "✓ vLLM repository cloned" echo "✓ vLLM repository cloned"
if [ "$DEVICE" = "xpu" ]; then
echo "\n=== Installing vLLM ==="
git apply --ignore-whitespace /tmp/vllm-xpu.patch
uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match
uv pip install --verbose --no-build-isolation .
fi
echo "\n=== Installing vLLM & FlashInfer ===" if [ "$DEVICE" = "cuda" ]; then
echo "\n=== Installing vLLM & FlashInfer ==="
# Build GitHub release wheel URL per CUDA version # Build GitHub release wheel URL per CUDA version
# CUDA 12 wheels have no +cu suffix and use manylinux_2_31 # CUDA 12 wheels have no +cu suffix and use manylinux_2_31
# CUDA 13 wheels have +cu130 suffix and use manylinux_2_35 # CUDA 13 wheels have +cu130 suffix and use manylinux_2_35
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl" VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS="" EXTRA_PIP_ARGS=""
elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl" VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}" EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}"
else else
echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}" echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
exit 1 exit 1
fi fi
VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}" VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}"
# Install vLLM wheel # Install vLLM wheel
# CUDA 12: Try PyPI first, fall back to GitHub release # CUDA 12: Try PyPI first, fall back to GitHub release
# CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend # CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend
# does not prevent uv from resolving the cu12 variant) # does not prevent uv from resolving the cu12 variant)
echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..." echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..."
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then
echo "✓ vLLM ${VLLM_VER} installed from PyPI" echo "✓ vLLM ${VLLM_VER} installed from PyPI"
else
echo "⚠ PyPI install failed, installing from GitHub release..."
uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi
else else
echo "⚠ PyPI install failed, installing from GitHub release..." echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..."
uv pip install ${EXTRA_PIP_ARGS} \ uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \ "${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND} --torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub" echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi fi
else uv pip install flashinfer-cubin==$FLASHINF_REF
echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..." uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi fi
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed" echo "✓ vLLM installation completed"
echo "\n=== Installing vLLM-Omni ===" echo "\n=== Installing vLLM-Omni ==="
...@@ -210,18 +239,19 @@ else ...@@ -210,18 +239,19 @@ else
echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)" echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi fi
echo "\n=== Installing DeepGEMM ===" if [ "$DEVICE" = "cuda" ]; then
cd $INSTALLATION_DIR/vllm/tools echo "\n=== Installing DeepGEMM ==="
if [ -n "$DEEPGEMM_REF" ]; then cd $INSTALLATION_DIR/vllm/tools
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF" if [ -n "$DEEPGEMM_REF" ]; then
else bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" else
fi bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
echo "✓ DeepGEMM installation completed" fi
echo "✓ DeepGEMM installation completed"
echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
fi
echo "\n✅ All installations completed successfully!" echo "\n✅ All installations completed successfully!"
...@@ -21,6 +21,15 @@ def parse_args(): ...@@ -21,6 +21,15 @@ def parse_args():
choices=["dynamo", "vllm", "sglang", "trtllm"], choices=["dynamo", "vllm", "sglang", "trtllm"],
help="Dockerfile framework to use", help="Dockerfile framework to use",
) )
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda", "xpu"],
help="Dockerfile device to use",
)
parser.add_argument( parser.add_argument(
"--target", "--target",
type=str, type=str,
...@@ -58,6 +67,7 @@ def parse_args(): ...@@ -58,6 +67,7 @@ def parse_args():
def validate_args(args): def validate_args(args):
valid_inputs = { valid_inputs = {
"vllm": { "vllm": {
"device": ["cuda", "xpu"],
"target": [ "target": [
"runtime", "runtime",
"dev", "dev",
...@@ -69,6 +79,7 @@ def validate_args(args): ...@@ -69,6 +79,7 @@ def validate_args(args):
"cuda_version": ["12.9", "13.0"], "cuda_version": ["12.9", "13.0"],
}, },
"trtllm": { "trtllm": {
"device": ["cuda"],
"target": [ "target": [
"runtime", "runtime",
"dev", "dev",
...@@ -80,6 +91,7 @@ def validate_args(args): ...@@ -80,6 +91,7 @@ def validate_args(args):
"cuda_version": ["13.1"], "cuda_version": ["13.1"],
}, },
"sglang": { "sglang": {
"device": ["cuda"],
"target": [ "target": [
"runtime", "runtime",
"dev", "dev",
...@@ -90,6 +102,7 @@ def validate_args(args): ...@@ -90,6 +102,7 @@ def validate_args(args):
"cuda_version": ["12.9", "13.0"], "cuda_version": ["12.9", "13.0"],
}, },
"dynamo": { "dynamo": {
"device": ["cuda"],
"target": [ "target": [
"runtime", "runtime",
"dev", "dev",
...@@ -106,14 +119,16 @@ def validate_args(args): ...@@ -106,14 +119,16 @@ def validate_args(args):
if ( if (
args.target in valid_inputs[args.framework]["target"] args.target in valid_inputs[args.framework]["target"]
and args.cuda_version in valid_inputs[args.framework]["cuda_version"] and args.cuda_version in valid_inputs[args.framework]["cuda_version"]
and args.device in valid_inputs[args.framework]["device"]
): ):
return return
raise ValueError( raise ValueError(
f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version}]" f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version},device={args.device}]"
) )
raise ValueError( raise ValueError(
f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version}]" f"Invalid input combination: [framework={args.framework},target={args.target},cuda_version={args.cuda_version},device={args.device}]"
) )
...@@ -128,6 +143,7 @@ def render(args, context, script_dir): ...@@ -128,6 +143,7 @@ def render(args, context, script_dir):
rendered = template.render( rendered = template.render(
context=context, context=context,
framework=args.framework, framework=args.framework,
device=args.device,
target=args.target, target=args.target,
platform=args.platform, platform=args.platform,
cuda_version=args.cuda_version, cuda_version=args.cuda_version,
...@@ -139,7 +155,7 @@ def render(args, context, script_dir): ...@@ -139,7 +155,7 @@ def render(args, context, script_dir):
if args.output_short_filename: if args.output_short_filename:
filename = "rendered.Dockerfile" filename = "rendered.Dockerfile"
else: else:
filename = f"{args.framework}-{args.target}-cuda{args.cuda_version}-{args.platform}-rendered.Dockerfile" filename = f"{args.framework}-{args.target}-{args.device}{args.cuda_version}-{args.platform}-rendered.Dockerfile"
with open(f"{script_dir}/{filename}", "w") as f: with open(f"{script_dir}/{filename}", "w") as f:
f.write(cleaned) f.write(cleaned)
...@@ -159,6 +175,9 @@ def render(args, context, script_dir): ...@@ -159,6 +175,9 @@ def render(args, context, script_dir):
def main(): def main():
args = parse_args() args = parse_args()
validate_args(args) validate_args(args)
# Clear cuda version for non-cuda device
if args.device != "cuda":
args.cuda_version = ""
script_dir = Path(__file__).parent script_dir = Path(__file__).parent
with open(f"{script_dir}/context.yaml", "r") as f: with open(f"{script_dir}/context.yaml", "r") as f:
context = yaml.safe_load(f) context = yaml.safe_load(f)
......
...@@ -18,21 +18,35 @@ ...@@ -18,21 +18,35 @@
#TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg #TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg
ARG ARCH={{ platform }} ARG ARCH={{ platform }}
ARG ARCH_ALT={{ "x86_64" if platform == "amd64" else "aarch64" }} ARG ARCH_ALT={{ "x86_64" if platform == "amd64" else "aarch64" }}
ARG DEVICE={{ device }}
{% if device == "cuda" -%}
{% set device_key = device + cuda_version -%}
{% else -%}
{% set device_key = device -%}
{% endif %}
# Python/CUDA configuration # Python/CUDA configuration
ARG PYTHON_VERSION={{ context.dynamo.python_version }} ARG PYTHON_VERSION={{ context.dynamo.python_version }}
{% if device == "cuda" -%}
ARG CUDA_VERSION={{ cuda_version }} ARG CUDA_VERSION={{ cuda_version }}
ARG CUDA_MAJOR=${CUDA_VERSION%%.*} ARG CUDA_MAJOR=${CUDA_VERSION%%.*}
{% endif %}
# Base and runtime images configuration # Base and runtime images configuration
{% set cuda_context_key = "cuda" + cuda_version %} ARG BASE_IMAGE={{ context[framework][device_key].base_image }}
ARG BASE_IMAGE={{ context[framework].base_image }} ARG BASE_IMAGE_TAG={{ context[framework][device_key].base_image_tag }}
ARG BASE_IMAGE_TAG={{ context[framework][cuda_context_key].base_image_tag }}
{% if framework in ["sglang", "trtllm", "vllm"] -%} {% if framework in ["sglang", "trtllm", "vllm"] -%}
ARG RUNTIME_IMAGE={{ context[framework].runtime_image }} ARG RUNTIME_IMAGE={{ context[framework][device_key].runtime_image }}
ARG RUNTIME_IMAGE_TAG={{ context[framework][cuda_context_key].runtime_image_tag }} ARG RUNTIME_IMAGE_TAG={{ context[framework][device_key].runtime_image_tag }}
{%- endif %} {%- endif %}
# wheel builder image selection
{% if device == "xpu" %}
ARG WHEEL_BUILDER_IMAGE=${BASE_IMAGE}:${BASE_IMAGE_TAG}
{% else %}
ARG WHEEL_BUILDER_IMAGE=quay.io/pypa/manylinux_2_28_${ARCH_ALT}
{% endif %}
# Build configuration # Build configuration
ARG ENABLE_KVBM={{ context[framework].enable_kvbm }} ARG ENABLE_KVBM={{ context[framework].enable_kvbm }}
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -42,7 +56,9 @@ ARG ETCD_VERSION={{ context.dynamo.etcd_version }} ...@@ -42,7 +56,9 @@ ARG ETCD_VERSION={{ context.dynamo.etcd_version }}
ARG ENABLE_MEDIA_FFMPEG={{ context[framework].enable_media_ffmpeg }} ARG ENABLE_MEDIA_FFMPEG={{ context[framework].enable_media_ffmpeg }}
ARG FFMPEG_VERSION={{ context.dynamo.ffmpeg_version }} ARG FFMPEG_VERSION={{ context.dynamo.ffmpeg_version }}
{% if device == "cuda" -%}
ARG ENABLE_GPU_MEMORY_SERVICE={{ context[framework].enable_gpu_memory_service }} ARG ENABLE_GPU_MEMORY_SERVICE={{ context[framework].enable_gpu_memory_service }}
{% endif %}
# SCCACHE configuration # SCCACHE configuration
ARG USE_SCCACHE ARG USE_SCCACHE
...@@ -52,8 +68,10 @@ ARG SCCACHE_REGION="" ...@@ -52,8 +68,10 @@ ARG SCCACHE_REGION=""
# NIXL configuration # NIXL configuration
ARG NIXL_UCX_REF={{ context.dynamo.nixl_ucx_ref }} ARG NIXL_UCX_REF={{ context.dynamo.nixl_ucx_ref }}
ARG NIXL_REF={{ context.dynamo.nixl_ref }} ARG NIXL_REF={{ context.dynamo.nixl_ref }}
{% if device == "cuda" %}
ARG NIXL_GDRCOPY_REF={{ context.dynamo.nixl_gdrcopy_ref }} ARG NIXL_GDRCOPY_REF={{ context.dynamo.nixl_gdrcopy_ref }}
ARG NIXL_LIBFABRIC_REF={{ context.dynamo.nixl_libfabric_ref }} ARG NIXL_LIBFABRIC_REF={{ context.dynamo.nixl_libfabric_ref }}
{% endif %}
{% if target == "dev" or target == "local-dev" %} {% if target == "dev" or target == "local-dev" %}
ARG FRAMEWORK={{ framework }} ARG FRAMEWORK={{ framework }}
...@@ -66,19 +84,23 @@ ARG FRONTEND_IMAGE={{ context.dynamo.frontend_image }} ...@@ -66,19 +84,23 @@ ARG FRONTEND_IMAGE={{ context.dynamo.frontend_image }}
{% if framework == "vllm" -%} {% if framework == "vllm" -%}
# Make sure to update the dependency version in pyproject.toml when updating this # Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF={{ context.vllm.vllm_ref }} ARG VLLM_REF={{ context[framework][device_key].vllm_ref }}
ARG MAX_JOBS={{ context.vllm.max_jobs }} ARG MAX_JOBS={{ context.vllm.max_jobs }}
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
{% if device == "cuda" -%}
ARG FLASHINF_REF={{ context.vllm.flashinf_ref }} ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
{% endif %}
ARG LMCACHE_REF={{ context.vllm.lmcache_ref }} ARG LMCACHE_REF={{ context.vllm.lmcache_ref }}
ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }} ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
{% if device == "cuda" -%}
# If left blank, then we will fallback to vLLM defaults # If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF="" ARG DEEPGEMM_REF=""
# ModelExpress for P2P weight transfer (optional) # ModelExpress for P2P weight transfer (optional)
ARG ENABLE_MODELEXPRESS_P2P={{ context.vllm.enable_modelexpress_p2p }} ARG ENABLE_MODELEXPRESS_P2P={{ context.vllm.enable_modelexpress_p2p }}
ARG MODELEXPRESS_REF={{ context.vllm.modelexpress_ref }} ARG MODELEXPRESS_REF={{ context.vllm.modelexpress_ref }}
{% endif %}
{%- endif -%} {%- endif -%}
{% if framework == "trtllm" %} {% if framework == "trtllm" %}
......
...@@ -15,6 +15,7 @@ FROM runtime AS dynamo_tools ...@@ -15,6 +15,7 @@ FROM runtime AS dynamo_tools
ARG ARCH ARG ARCH
ARG ARCH_ALT ARG ARCH_ALT
ARG DEVICE
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV PATH=/usr/local/bin:${PATH} ENV PATH=/usr/local/bin:${PATH}
......
...@@ -15,6 +15,7 @@ FROM aws AS local-dev ...@@ -15,6 +15,7 @@ FROM aws AS local-dev
ENV USERNAME=dynamo ENV USERNAME=dynamo
ARG USER_UID ARG USER_UID
ARG USER_GID ARG USER_GID
ARG DEVICE
# rustup is already at /home/dynamo/.rustup from the dev stage (COPY --from=wheel_builder # rustup is already at /home/dynamo/.rustup from the dev stage (COPY --from=wheel_builder
# with --chown=dynamo:0 --chmod=775), so no re-copy needed here. # with --chown=dynamo:0 --chmod=775), so no re-copy needed here.
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# PURPOSE: Framework development and vLLM compilation # PURPOSE: Framework development and vLLM compilation
# #
# This stage builds and compiles framework dependencies including: # This stage builds and compiles framework dependencies including:
# - vLLM inference engine with CUDA support # - vLLM inference engine with CUDA/XPU support
# - DeepGEMM and FlashInfer optimizations # - DeepGEMM and FlashInfer optimizations
# - All necessary build tools and compilation dependencies # - All necessary build tools and compilation dependencies
# - Framework-level Python packages and extensions # - Framework-level Python packages and extensions
...@@ -27,6 +27,7 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework ...@@ -27,6 +27,7 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/ COPY --from=dynamo_base /bin/uv /bin/uvx /bin/
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG DEVICE
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds. # Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
...@@ -65,15 +66,27 @@ ARG ARCH ...@@ -65,15 +66,27 @@ ARG ARCH
# rebuilds from unrelated source code changes # rebuilds from unrelated source code changes
ARG VLLM_REF ARG VLLM_REF
ARG VLLM_GIT_URL ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG LMCACHE_REF ARG LMCACHE_REF
ARG VLLM_OMNI_REF ARG VLLM_OMNI_REF
{% if device == "cuda" %}
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG CUDA_VERSION ARG CUDA_VERSION
{% endif %}
ARG MAX_JOBS ARG MAX_JOBS
ENV MAX_JOBS=$MAX_JOBS ENV MAX_JOBS=$MAX_JOBS
{% if device == "cuda" %}
ENV CUDA_HOME=/usr/local/cuda ENV CUDA_HOME=/usr/local/cuda
{% endif %}
{% if device == "xpu" %}
RUN wget --tries=3 --waitretry=5 https://raw.githubusercontent.com/intel/llm-scaler/35a14cbc08d714f460a29b7a7328df5620c8530f/vllm/patches/ai-dynamo-xpu/patches/vllm-xpu-v0.14.0.patch -O /tmp/vllm-xpu.patch
ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
{% endif %}
# Install VLLM and related dependencies # Install VLLM and related dependencies
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
...@@ -82,16 +95,19 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ ...@@ -82,16 +95,19 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \ cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \ chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh \ /tmp/install_vllm.sh \
--device $DEVICE \
--vllm-ref $VLLM_REF \ --vllm-ref $VLLM_REF \
--max-jobs $MAX_JOBS \ --max-jobs $MAX_JOBS \
--arch $ARCH \ --arch $ARCH \
--installation-dir /opt \ --installation-dir /opt \
${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \ ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \ ${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \
--cuda-version $CUDA_VERSION ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
${CUDA_VERSION:+--cuda-version "$CUDA_VERSION"}
{% if device == "cuda" %}
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\ /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH $LD_LIBRARY_PATH
{% endif %}
...@@ -24,10 +24,19 @@ ...@@ -24,10 +24,19 @@
FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
ARG DEVICE
WORKDIR /workspace WORKDIR /workspace
ENV DYNAMO_HOME=/opt/dynamo ENV DYNAMO_HOME=/opt/dynamo
ENV VIRTUAL_ENV=/opt/dynamo/venv ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
{% if device == "xpu" %}
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics
{% endif %}
{% if device == "cuda" %}
# Set CUDA_DEVICE_ORDER to ensure CUDA logical device IDs match NVML physical device IDs # Set CUDA_DEVICE_ORDER to ensure CUDA logical device IDs match NVML physical device IDs
# This fixes NVML InvalidArgument errors when CUDA_VISIBLE_DEVICES is set # This fixes NVML InvalidArgument errors when CUDA_VISIBLE_DEVICES is set
ENV CUDA_DEVICE_ORDER=PCI_BUS_ID ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
...@@ -51,13 +60,19 @@ ENV CPATH=/usr/local/cuda/include \ ...@@ -51,13 +60,19 @@ ENV CPATH=/usr/local/cuda/include \
TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \ TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \ TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
TRITON_CUDART_PATH=/usr/local/cuda/include TRITON_CUDART_PATH=/usr/local/cuda/include
{% endif %}
### COPY NATS & ETCD ### ### COPY NATS & ETCD ###
# Copy nats and etcd from dev image # Copy nats and etcd from dev image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/ COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
{% if device == "xpu" %}
ENV PATH=/usr/local/bin/etcd/:$PATH
{% else %}
# Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible # Add ETCD and CUDA binaries to PATH so cicc and other CUDA tools are accessible
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH
{% endif %}
# Copy uv to system /bin # Copy uv to system /bin
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/ COPY --from=dynamo_base /bin/uv /bin/uvx /bin/
...@@ -82,8 +97,10 @@ ENV PYTHON_VERSION=${PYTHON_VERSION} ...@@ -82,8 +97,10 @@ ENV PYTHON_VERSION=${PYTHON_VERSION}
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds. # Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt-get update && \ apt-get update && \
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} &&\ if [ "$DEVICE" = "cuda" ]; then \
CUDA_VERSION_MINOR=$(echo "${CUDA_VERSION#*.}" | cut -d. -f1) && \ CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} &&\
CUDA_VERSION_MINOR=$(echo "${CUDA_VERSION#*.}" | cut -d. -f1); \
fi && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# Python runtime - CRITICAL for virtual environment to work # Python runtime - CRITICAL for virtual environment to work
python${PYTHON_VERSION}-dev \ python${PYTHON_VERSION}-dev \
...@@ -104,11 +121,39 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ ...@@ -104,11 +121,39 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
# prometheus dependencies # prometheus dependencies
ca-certificates \ ca-certificates \
# opencv-python-headless (vLLM dependency) requires libxcb for some functions # opencv-python-headless (vLLM dependency) requires libxcb for some functions
libxcb1 \ libxcb1 && \
if [ "$DEVICE" = "cuda" ]; then \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image # DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR} && \ cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}; \
fi && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
{% if device == "xpu" %}
# XPU runtime image: extra system libraries plus the Intel GPU runtime packages.
# DEBIAN_FRONTEND is exported for the whole RUN so that neither install can stall on a
# debconf prompt (same setting the other apt-get installs in this stage use).
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get update && \
    apt-get install -y --no-install-recommends --fix-missing \
        #ffmpeg \
        libsndfile1 \
        libsm6 \
        libxext6 \
        libgl1 \
        lsb-release \
        numactl \
        wget \
        vim \
        linux-libc-dev && \
    # Install Intel GPU runtime packages
    apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing \
        intel-ocloc intel-oneapi-compiler-dpcpp-cpp-2025.3 && \
    # Drop package caches and index files so they do not end up in the layer
    apt-get clean && rm -rf /var/lib/apt/lists/*
# Install oneCCL 2021.15.7 from the upstream offline installer and point ccl/latest at it.
# The installer is staged in /tmp and removed in the same RUN so it is not baked into the layer;
# the download is retried on transient failures.
RUN wget --tries=3 --waitretry=5 -P /tmp \
        https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/intel-oneccl-2021.15.7.8_offline.sh && \
    bash /tmp/intel-oneccl-2021.15.7.8_offline.sh -a --silent --eula accept && \
    rm -f /tmp/intel-oneccl-2021.15.7.8_offline.sh && \
    # Shells that read /etc/bash.bashrc load the oneAPI environment on start
    echo "source /opt/intel/oneapi/setvars.sh --force" >> /etc/bash.bashrc && \
    # Replace whatever ccl/latest pointed to with the 2021.15 tree
    rm -f /opt/intel/oneapi/ccl/latest && \
    ln -s /opt/intel/oneapi/ccl/2021.15 /opt/intel/oneapi/ccl/latest
{% endif %}
{% if context.vllm.enable_media_ffmpeg == "true" %} {% if context.vllm.enable_media_ffmpeg == "true" %}
# Copy ffmpeg libraries from wheel_builder (requires root, runs before USER dynamo) # Copy ffmpeg libraries from wheel_builder (requires root, runs before USER dynamo)
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
...@@ -124,9 +169,15 @@ ENV HOME=/home/dynamo ...@@ -124,9 +169,15 @@ ENV HOME=/home/dynamo
# This picks up the umask 002 from the /etc/profile.d/00-umask.sh file for subsequent RUN commands # This picks up the umask 002 from the /etc/profile.d/00-umask.sh file for subsequent RUN commands
SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"] SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
# NIXL install locations. Only the prefix depends on the device; the library and
# plugin directories are derived from it the same way for every device.
{% if device == "xpu" %}
ENV NIXL_PREFIX=/opt/intel/intel_nixl
{% else %}
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
{% endif %}
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# Site-packages path derived from PYTHON_VERSION ARG # Site-packages path derived from PYTHON_VERSION ARG
ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages
...@@ -138,15 +189,19 @@ ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages ...@@ -138,15 +189,19 @@ ARG SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages
# #
# Layer sizes (uncompressed): nvidia=4.5GB, flashinfer_jit_cache=4.1GB, torch=2.1GB, # Layer sizes (uncompressed): nvidia=4.5GB, flashinfer_jit_cache=4.1GB, torch=2.1GB,
# vllm=1.2GB, triton=592MB, flashinfer_cubin=437MB # vllm=1.2GB, triton=592MB, flashinfer_cubin=437MB
{% if device == "cuda" %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SITE_PACKAGES}/nvidia COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/nvidia ${SITE_PACKAGES}/nvidia
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_jit_cache ${SITE_PACKAGES}/flashinfer_jit_cache
{% endif %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/torch ${SITE_PACKAGES}/torch
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm ${SITE_PACKAGES}/vllm
{% if platform == "amd64" -%} {% if platform == "amd64" -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/vllm_omni ${SITE_PACKAGES}/vllm_omni
{% endif -%} {% endif -%}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/triton ${SITE_PACKAGES}/triton
{% if device == "cuda" %}
COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin COPY --chmod=775 --chown=dynamo:0 --from=framework ${SITE_PACKAGES}/flashinfer_cubin ${SITE_PACKAGES}/flashinfer_cubin
{% endif %}
# Remaining packages and venv structure (bin/, include/, share/, etc.) # Remaining packages and venv structure (bin/, include/, share/, etc.)
COPY --chmod=775 --chown=dynamo:0 --from=framework \ COPY --chmod=775 --chown=dynamo:0 --from=framework \
--exclude=lib/python*/site-packages/nvidia \ --exclude=lib/python*/site-packages/nvidia \
...@@ -166,26 +221,37 @@ COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm ...@@ -166,26 +221,37 @@ COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm
# Copy UCX and NIXL to system directories (read-only, no group-write needed) # Copy UCX and NIXL to system directories (read-only, no group-write needed)
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
{% if device != "xpu" %}
# The NVIDIA build keeps its NIXL libraries under lib64; copy them into the directory that
# NIXL_LIB_DIR points at in this stage.
# XPU needs no extra step: its copy would have identical source and destination paths, and that
# tree already arrives with the $NIXL_PREFIX COPY, so repeating it only adds a duplicate layer.
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
{% endif %}
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
{% if device == "cuda" %}
# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support) # Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/ COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/ COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/ COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/ COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/
{% endif %}
ENV PATH=/usr/local/ucx/bin:$PATH ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$NIXL_LIB_DIR:\ $NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\ $NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\ /usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\ /usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH $LD_LIBRARY_PATH
{% if device == "cuda" %}
ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH
ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
{% endif %}
# TODO: skip /workspace COPYs for dev/local-dev (bind-mounted from host, these get shadowed) # TODO: skip /workspace COPYs for dev/local-dev (bind-mounted from host, these get shadowed)
COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/ COPY --chmod=664 --chown=dynamo:0 ATTRIBUTION* LICENSE /workspace/
...@@ -226,6 +292,7 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \ ...@@ -226,6 +292,7 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
uv pip install /opt/dynamo/wheelhouse/nixl/nixl*.whl uv pip install /opt/dynamo/wheelhouse/nixl/nixl*.whl
{% endif %} {% endif %}
{% if device == "cuda" %}
# Install gpu_memory_service wheel if enabled (all targets) # Install gpu_memory_service wheel if enabled (all targets)
ARG ENABLE_GPU_MEMORY_SERVICE ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
...@@ -235,7 +302,6 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \ ...@@ -235,7 +302,6 @@ RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
if [ -n "$GMS_WHEEL" ]; then uv pip install "$GMS_WHEEL"; fi; \ if [ -n "$GMS_WHEEL" ]; then uv pip install "$GMS_WHEEL"; fi; \
fi fi
# Install ModelExpress for P2P weight transfer (optional) # Install ModelExpress for P2P weight transfer (optional)
ARG ENABLE_MODELEXPRESS_P2P ARG ENABLE_MODELEXPRESS_P2P
ARG MODELEXPRESS_REF ARG MODELEXPRESS_REF
...@@ -243,6 +309,7 @@ RUN if [ "${ENABLE_MODELEXPRESS_P2P}" = "true" ]; then \ ...@@ -243,6 +309,7 @@ RUN if [ "${ENABLE_MODELEXPRESS_P2P}" = "true" ]; then \
echo "Installing ModelExpress from ref: ${MODELEXPRESS_REF}" && \ echo "Installing ModelExpress from ref: ${MODELEXPRESS_REF}" && \
uv pip install "modelexpress @ git+https://github.com/ai-dynamo/modelexpress.git@${MODELEXPRESS_REF}#subdirectory=modelexpress_client/python"; \ uv pip install "modelexpress @ git+https://github.com/ai-dynamo/modelexpress.git@${MODELEXPRESS_REF}#subdirectory=modelexpress_client/python"; \
fi fi
{% endif %}
# Install common and test dependencies. Cache uv downloads; uv handles its own locking for this cache. # Install common and test dependencies. Cache uv downloads; uv handles its own locking for this cache.
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
...@@ -274,6 +341,7 @@ RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} & ...@@ -274,6 +341,7 @@ RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} &
echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \ echo 'source /opt/dynamo/venv/bin/activate' >> /etc/bash.bashrc && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
{% if device == "cuda" %}
# Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks) # Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks)
# This recreates proper symlinks to save space and suppress ldconfig warnings # This recreates proper symlinks to save space and suppress ldconfig warnings
RUN cd /usr/local/lib && \ RUN cd /usr/local/lib && \
...@@ -304,15 +372,25 @@ RUN cd /usr/local/lib && \ ...@@ -304,15 +372,25 @@ RUN cd /usr/local/lib && \
fi; \ fi; \
done && \ done && \
ldconfig ldconfig
{% endif %}
USER dynamo USER dynamo
ARG DYNAMO_COMMIT_SHA ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
{% if device == "xpu" %}
# Swap whatever triton build is installed for triton-xpu 3.6.0 from the PyTorch XPU test index,
# then remove the oneccl / oneccl-devel Python packages.
# The uv cache is mounted (same mount options as the other uv steps in this stage) so downloaded
# artifacts stay out of the image layer.
RUN --mount=type=cache,target=/home/dynamo/.cache/uv,uid=1000,gid=0,mode=0775 \
    uv pip uninstall triton triton-xpu && \
    uv pip install triton-xpu==3.6.0 --extra-index-url=https://download.pytorch.org/whl/test/xpu && \
    uv pip uninstall oneccl oneccl-devel
SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /etc/bash.bashrc && exec bash"]
{% else %}
# In vLLM 0.12 the default sampler changed on the forward pass. # In vLLM 0.12 the default sampler changed on the forward pass.
# We need to enable this to enable the cuda kernels. # We need to enable this to enable the cuda kernels.
ENV VLLM_USE_FLASHINFER_SAMPLER=1 ENV VLLM_USE_FLASHINFER_SAMPLER=1
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD [] CMD []
{% endif %}
...@@ -7,21 +7,21 @@ ...@@ -7,21 +7,21 @@
##### Wheel Build Image ########## ##### Wheel Build Image ##########
################################## ##################################
# Redeclare ARCH_ALT ARG so it's available for interpolation in the FROM instruction FROM ${WHEEL_BUILDER_IMAGE} AS wheel_builder
ARG ARCH_ALT
FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder
# Redeclare ARGs for this stage # Redeclare ARGs for this stage
ARG ARCH ARG ARCH
ARG ARCH_ALT ARG ARCH_ALT
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
ARG DEVICE
WORKDIR /workspace WORKDIR /workspace
{% if device == "cuda" %}
# Copy CUDA from base stage # Copy CUDA from base stage
COPY --from=dynamo_base /usr/local/cuda /usr/local/cuda COPY --from=dynamo_base /usr/local/cuda /usr/local/cuda
COPY --from=dynamo_base /etc/ld.so.conf.d/hpcx.conf /etc/ld.so.conf.d/hpcx.conf COPY --from=dynamo_base /etc/ld.so.conf.d/hpcx.conf /etc/ld.so.conf.d/hpcx.conf
{% endif %}
# Set environment variables first so they can be used in COPY commands # Set environment variables first so they can be used in COPY commands
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
...@@ -34,6 +34,71 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \ ...@@ -34,6 +34,71 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME
{% if device == "xpu" %}
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Register Intel's package sources: store the de-armored oneAPI signing key under
# /usr/share/keyrings, add the oneAPI apt repository signed by it, and enable the
# kobuk-team intel-graphics PPA.
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
        | gpg --dearmor \
        | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
        | tee /etc/apt/sources.list.d/oneAPI.list && \
    add-apt-repository -y ppa:kobuk-team/intel-graphics
# Fetch the UCX patch into /tmp/ucx.patch. The URL is pinned to a fixed llm-scaler commit, and the
# download is retried up to 3 times. The UCX build step applies this file with `git apply` when
# DEVICE is xpu.
RUN wget --tries=3 --waitretry=5 https://raw.githubusercontent.com/intel/llm-scaler/35a14cbc08d714f460a29b7a7328df5620c8530f/vllm/patches/ai-dynamo-xpu/patches/ucx-v1.12.0.patch -O /tmp/ucx.patch
# XPU wheel builder: general tooling, runtime libraries and the Intel GPU packages.
# Uses apt-get throughout (the `apt` front end has no stable scripting interface) and exports
# DEBIAN_FRONTEND so no step can block on a debconf prompt.
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get clean && apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
        curl \
        #ffmpeg \
        ca-certificates \
        zip \
        unzip \
        git \
        libsndfile1 \
        libsm6 \
        libxext6 \
        libgl1 \
        lsb-release \
        libaio-dev \
        numactl \
        wget \
        vim \
        linux-libc-dev && \
    # Install Intel GPU runtime packages
    # --with-new-pkgs keeps the behaviour of `apt upgrade` (new dependencies may be installed)
    apt-get update -y && apt-get upgrade -y --with-new-pkgs && \
    apt-get install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd \
        libze-intel-gpu-raytracing intel-ocloc intel-oneapi-compiler-dpcpp-cpp-2025.3 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
# Build-time toolchain for the XPU wheel builder: NIXL build tooling plus the
# native dependencies of the Rust build. Package caches are dropped in the same layer.
RUN apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # NIXL build dependencies
        autoconf \
        automake \
        cmake \
        git-lfs \
        libtool \
        meson \
        net-tools \
        ninja-build \
        pybind11-dev \
        # Rust build dependencies
        clang \
        libclang-dev \
        protobuf-compiler && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# RDMA / InfiniBand verbs and NUMA development packages, force-reinstalled (--reinstall)
# even if a version is already present. Package caches are dropped in the same layer.
RUN apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall --no-install-recommends \
        libibverbs-dev \
        rdma-core \
        ibverbs-utils \
        libibumad-dev \
        libnuma-dev \
        librdmacm-dev \
        ibverbs-providers && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
{% endif %}
{% if device == "cuda" %}
# Install system dependencies # Install system dependencies
# Cache dnf downloads; sharing=locked avoids dnf/rpm races with concurrent builds. # Cache dnf downloads; sharing=locked avoids dnf/rpm races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/dnf,sharing=locked \ RUN --mount=type=cache,target=/var/cache/dnf,sharing=locked \
...@@ -84,7 +149,7 @@ ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \ ...@@ -84,7 +149,7 @@ ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \
LD_LIBRARY_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64:${LD_LIBRARY_PATH}" \ LD_LIBRARY_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64:${LD_LIBRARY_PATH}" \
CC="/opt/rh/gcc-toolset-14/root/usr/bin/gcc" \ CC="/opt/rh/gcc-toolset-14/root/usr/bin/gcc" \
CXX="/opt/rh/gcc-toolset-14/root/usr/bin/g++" CXX="/opt/rh/gcc-toolset-14/root/usr/bin/g++"
{% endif %}
# Ensure a modern protoc is available (required for --experimental_allow_proto3_optional) # Ensure a modern protoc is available (required for --experimental_allow_proto3_optional)
RUN set -eux; \ RUN set -eux; \
...@@ -104,10 +169,16 @@ RUN set -eux; \ ...@@ -104,10 +169,16 @@ RUN set -eux; \
# Point build tools explicitly at the modern protoc # Point build tools explicitly at the modern protoc
ENV PROTOC=/usr/local/bin/protoc ENV PROTOC=/usr/local/bin/protoc
{% if device == "xpu" %}
# Install uv package manager
# The uv and uvx binaries are copied out of the published uv image into /bin.
# NOTE(review): the `latest` tag is floating, so the uv version can change between builds;
# consider pinning a specific tag or digest.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# Put /usr/local/lib and /usr/local/lib64 ahead of any inherited LD_LIBRARY_PATH entries.
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH
{% else %}
ENV CUDA_PATH=/usr/local/cuda \ ENV CUDA_PATH=/usr/local/cuda \
PATH=/usr/local/cuda/bin:$PATH \ PATH=/usr/local/cuda/bin:$PATH \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH \ LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH \
NVIDIA_DRIVER_CAPABILITIES=video,compute,utility NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
{% endif %}
# Create virtual environment for building wheels # Create virtual environment for building wheels
ARG PYTHON_VERSION ARG PYTHON_VERSION
...@@ -120,6 +191,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -120,6 +191,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ARG NIXL_UCX_REF ARG NIXL_UCX_REF
ARG NIXL_REF ARG NIXL_REF
{% if device == "cuda" %}
ARG NIXL_GDRCOPY_REF ARG NIXL_GDRCOPY_REF
# Build and install gdrcopy # Build and install gdrcopy
...@@ -129,6 +202,7 @@ RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/g ...@@ -129,6 +202,7 @@ RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/g
rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \ rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \
rpm -Uvh gdrcopy-*.el8.${ARCH_ALT}.rpm && \ rpm -Uvh gdrcopy-*.el8.${ARCH_ALT}.rpm && \
rpm -Uvh gdrcopy-devel-*.el8.noarch.rpm rpm -Uvh gdrcopy-devel-*.el8.noarch.rpm
{% endif %}
# sccache binary is pre-installed in dynamo_base; stage it off-PATH so # sccache binary is pre-installed in dynamo_base; stage it off-PATH so
# Meson doesn't auto-detect it as a CUDA compiler launcher # Meson doesn't auto-detect it as a CUDA compiler launcher
...@@ -159,7 +233,12 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -159,7 +233,12 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \ eval $(/tmp/use-sccache.sh setup-env); \
fi && \ fi && \
# Install pkg-config with the package manager that matches the builder image:
# apt-get when DEVICE is xpu, dnf when DEVICE is cuda.
# Every step is &&-chained so a failed install fails the build instead of being
# masked by the exit status of the cache cleanup that follows it.
if [ "$DEVICE" = "xpu" ]; then \
    apt-get update -y && apt-get install -y pkg-config && \
    apt-get clean && rm -rf /var/lib/apt/lists/*; \
elif [ "$DEVICE" = "cuda" ]; then \
    dnf install -y pkg-config; \
fi && \
cd /tmp && \ cd /tmp && \
curl -LO https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz && \ curl -LO https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz && \
tar xf ffmpeg-${FFMPEG_VERSION}.tar.xz && \ tar xf ffmpeg-${FFMPEG_VERSION}.tar.xz && \
...@@ -195,11 +274,30 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -195,11 +274,30 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
eval $(/tmp/use-sccache.sh setup-env); \ eval $(/tmp/use-sccache.sh setup-env); \
fi && \ fi && \
cd /usr/local/src && \ cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \ git clone https://github.com/openucx/ucx.git && \
cd ucx && \ cd ucx && \
git checkout $NIXL_UCX_REF && \ git checkout $NIXL_UCX_REF && \
./autogen.sh && \ if [ "$DEVICE" = "xpu" ]; then \
./contrib/configure-release \ git apply --ignore-whitespace /tmp/ucx.patch; \
fi && \
./autogen.sh && \
if [ "$DEVICE" = "xpu" ]; then \
./contrib/configure-release \
--prefix=/usr/local/ucx \
--with-ze \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-verbs \
--with-dm \
--with-efa \
--without-cuda \
--enable-mt; \
elif [ "$DEVICE" = "cuda" ]; then \
./contrib/configure-release \
--prefix=/usr/local/ucx \ --prefix=/usr/local/ucx \
--enable-shared \ --enable-shared \
--disable-static \ --disable-static \
...@@ -212,7 +310,8 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -212,7 +310,8 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--with-dm \ --with-dm \
--with-gdrcopy=/usr/local \ --with-gdrcopy=/usr/local \
--with-efa \ --with-efa \
--enable-mt && \ --enable-mt; \
fi && \
make -j && \ make -j && \
make -j install-strip && \ make -j install-strip && \
/tmp/use-sccache.sh show-stats "UCX" && \ /tmp/use-sccache.sh show-stats "UCX" && \
...@@ -220,6 +319,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -220,6 +319,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \ echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \
ldconfig ldconfig
{% if device == "cuda" %}
ARG NIXL_LIBFABRIC_REF ARG NIXL_LIBFABRIC_REF
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
...@@ -248,8 +348,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -248,8 +348,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
/tmp/use-sccache.sh show-stats "LIBFABRIC" && \ /tmp/use-sccache.sh show-stats "LIBFABRIC" && \
echo "/usr/local/libfabric/lib" > /etc/ld.so.conf.d/libfabric.conf && \ echo "/usr/local/libfabric/lib" > /etc/ld.so.conf.d/libfabric.conf && \
ldconfig ldconfig
{% endif %}
{% if framework == "vllm" %} {% if framework == "vllm" and device == "cuda" %}
# Build and install AWS SDK C++ (required for NIXL OBJ backend / S3 support) # Build and install AWS SDK C++ (required for NIXL OBJ backend / S3 support)
ARG AWS_SDK_CPP_VERSION=1.11.581 ARG AWS_SDK_CPP_VERSION=1.11.581
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
...@@ -277,7 +378,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -277,7 +378,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
{% endif %} {% endif %}
# build and install nixl # build and install nixl
{% if device == "cuda" %}
ARG CUDA_MAJOR ARG CUDA_MAJOR
{% endif %}
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \ export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
...@@ -288,22 +392,38 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -288,22 +392,38 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
git clone "https://github.com/ai-dynamo/nixl.git" && \ git clone "https://github.com/ai-dynamo/nixl.git" && \
cd nixl && \ cd nixl && \
git checkout ${NIXL_REF} && \ git checkout ${NIXL_REF} && \
# Choose the NIXL wheel name for the target device. Any other DEVICE value aborts the
# build here with a clear message instead of handing an empty --wheel-name to tomlutil.py.
if [ "$DEVICE" = "cuda" ]; then \
    PKG_NAME="nixl-cu${CUDA_MAJOR}"; \
elif [ "$DEVICE" = "xpu" ]; then \
    PKG_NAME="nixl-xpu"; \
else \
    echo "Unsupported DEVICE '${DEVICE}' (expected 'cuda' or 'xpu')" >&2; exit 1; \
fi && \
./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml && \ ./contrib/tomlutil.py --wheel-name $PKG_NAME pyproject.toml && \
mkdir build && \ mkdir build && \
meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \ if [ "$DEVICE" = "cuda" ]; then \
-Dcudapath_lib="/usr/local/cuda/lib64" \ meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \
-Dcudapath_inc="/usr/local/cuda/include" \ -Dcudapath_lib="/usr/local/cuda/lib64" \
-Ducx_path="/usr/local/ucx" \ -Dcudapath_inc="/usr/local/cuda/include" \
-Dlibfabric_path="/usr/local/libfabric" && \ -Ducx_path="/usr/local/ucx" \
-Dlibfabric_path="/usr/local/libfabric"; \
elif [ "$DEVICE" = "xpu" ]; then \
meson setup build/ --prefix=/opt/intel/intel_nixl --buildtype=release \
-Ducx_path="/usr/local/ucx"; \
fi && \
cd build && \ cd build && \
ninja && \ ninja && \
ninja install && \ ninja install && \
/tmp/use-sccache.sh show-stats "NIXL" /tmp/use-sccache.sh show-stats "NIXL"
{% if device == "xpu" %}
ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu \
NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/plugins \
NIXL_PREFIX=/opt/intel/intel_nixl
{% else %}
ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \ ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \ NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
NIXL_PREFIX=/opt/nvidia/nvda_nixl NIXL_PREFIX=/opt/nvidia/nvda_nixl
{% endif %}
ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH} ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH}
RUN echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \ RUN echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \
...@@ -355,21 +475,24 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -355,21 +475,24 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
else \ else \
maturin build --release --out /opt/dynamo/dist; \ maturin build --release --out /opt/dynamo/dist; \
fi && \ fi && \
if [ "$ENABLE_KVBM" == "true" ]; then \ if [ "$ENABLE_KVBM" = "true" ]; then \
cd /opt/dynamo/lib/bindings/kvbm && \ cd /opt/dynamo/lib/bindings/kvbm && \
maturin build --release --out target/wheels && \ maturin build --release --out target/wheels && \
auditwheel repair \ if [ "$DEVICE" = "cuda" ]; then \
--exclude libnixl.so \ auditwheel repair \
--exclude libnixl_build.so \ --exclude libnixl.so \
--exclude libnixl_common.so \ --exclude libnixl_build.so \
--exclude 'lib*.so*' \ --exclude libnixl_common.so \
--plat manylinux_2_28_${ARCH_ALT} \ --exclude 'lib*.so*' \
--wheel-dir /opt/dynamo/dist \ --plat manylinux_2_28_${ARCH_ALT} \
target/wheels/*.whl; \ --wheel-dir /opt/dynamo/dist \
target/wheels/*.whl; \
elif [ "$DEVICE" = "xpu" ]; then \
cp target/wheels/*.whl /opt/dynamo/dist/; \
fi; \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
{% else %} {% else %}
# Dev/local-dev targets do not have pre-built wheels or /workspace source code. # Dev/local-dev targets do not have pre-built wheels or /workspace source code.
# After you start the local-dev/dev container, you will need to build from source: # After you start the local-dev/dev container, you will need to build from source:
...@@ -387,6 +510,8 @@ COPY lib/gpu_memory_service/ /opt/dynamo/lib/gpu_memory_service/ ...@@ -387,6 +510,8 @@ COPY lib/gpu_memory_service/ /opt/dynamo/lib/gpu_memory_service/
{% endif %} {% endif %}
# Build gpu-memory-service wheel → /opt/dynamo/dist/gpu_memory_service*.whl (small C++ extension, fast build -- all targets, all frameworks) # Build gpu-memory-service wheel → /opt/dynamo/dist/gpu_memory_service*.whl (small C++ extension, fast build -- all targets, all frameworks)
{% if device == "cuda" %}
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \ if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
...@@ -394,3 +519,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -394,3 +519,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
source ${VIRTUAL_ENV}/bin/activate && \ source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \ uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi fi
{% endif %}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment