Unverified commit da354663, authored by Ran Rubin, committed by GitHub
Browse files

chore: Modify Docker templates to support multi arch builds (#6936)

parent 545fa300
......@@ -9,6 +9,31 @@ from pathlib import Path
import yaml
from jinja2 import Environment, FileSystemLoader, StrictUndefined
_VALID_ARCHS = {"amd64", "arm64"}


def parse_platform(platform_str: str) -> str:
    """Normalize a --platform value to the template variable used by Jinja2.

    Accepts Docker-style values (``linux/amd64``, ``linux/arm64``), values
    that carry a variant (``linux/arm64/v8``), the short form (``amd64``,
    ``arm64``), and comma-separated lists of any of these.

    Args:
        platform_str: Raw value of the ``--platform`` command-line option.

    Returns:
        ``'amd64'`` or ``'arm64'`` when exactly one distinct architecture is
        requested, ``'multi'`` when more than one is.

    Raises:
        ValueError: If any entry names an architecture outside
            ``_VALID_ARCHS``. Empty entries (e.g. from a trailing comma)
            are rejected the same way.
    """
    archs = []
    for entry in platform_str.split(","):
        fields = entry.strip().split("/")
        # Docker platform strings are os/arch[/variant]; with three or more
        # fields the architecture is the second one, not the last.
        arch = fields[1] if len(fields) >= 3 else fields[-1]
        if arch not in _VALID_ARCHS:
            raise ValueError(
                f"Unrecognized architecture '{arch}' in --platform '{platform_str}'. "
                f"Valid architectures: {', '.join(sorted(_VALID_ARCHS))}"
            )
        # Drop repeats so that e.g. "linux/amd64,amd64" is still treated as
        # a single-arch request instead of being reported as 'multi'.
        if arch not in archs:
            archs.append(arch)
    # The loop either raised or appended at least once, so archs is non-empty.
    return "multi" if len(archs) > 1 else archs[0]
def parse_args():
parser = argparse.ArgumentParser(
......@@ -39,8 +64,15 @@ def parse_args():
parser.add_argument(
"--platform",
type=str,
default="amd64",
help="Dockerfile platform to use. [amd64, arm64]",
default="linux/amd64",
help=(
"Target platform(s), Docker-style. Examples:\n"
" linux/amd64 single-arch amd64 build\n"
" linux/arm64 single-arch arm64 build\n"
" linux/amd64,linux/arm64 multi-arch build; the rendered Dockerfile uses\n"
" Docker BuildX TARGETARCH directly (set per platform\n"
" by: docker buildx build --platform linux/amd64,linux/arm64)"
),
)
parser.add_argument(
"--cuda-version",
......@@ -145,7 +177,7 @@ def render(args, context, script_dir):
framework=args.framework,
device=args.device,
target=args.target,
platform=args.platform,
platform=args.platform, # normalized: 'amd64', 'arm64', or 'multi'
cuda_version=args.cuda_version,
make_efa=args.make_efa,
)
......@@ -174,6 +206,9 @@ def render(args, context, script_dir):
def main():
args = parse_args()
# Normalize platform to template variable ('amd64', 'arm64', or 'multi')
# and store it back so render() and validate_args() both see the normalized form.
args.platform = parse_platform(args.platform)
validate_args(args)
# Clear cuda version for non-cuda device
if args.device != "cuda":
......
......@@ -6,18 +6,14 @@
##########################
#### Build Arguments #####
##########################
# Define general architecture ARGs for supporting both x86 and aarch64 builds.
# ARCH: Used for package suffixes (e.g., amd64, arm64)
# ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
# TARGETARCH is set automatically by Docker BuildKit for every --platform build.
# It must NOT be declared in the global scope (before any FROM) — doing so shadows
# the automatic per-platform value that BuildKit injects.
#
# Default values are for x86/amd64:
# --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64
# In each stage that needs it, re-declare with: ARG TARGETARCH
#
# For arm64/aarch64, build with:
# --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64
#TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg
ARG ARCH={{ platform }}
ARG ARCH_ALT={{ "x86_64" if platform == "amd64" else "aarch64" }}
# ARCH_ALT (x86_64 / aarch64) is computed inline in RUN steps:
# ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64")
ARG DEVICE={{ device }}
{% if device == "cuda" -%}
{% set device_key = device + cuda_version -%}
......@@ -43,8 +39,11 @@ ARG RUNTIME_IMAGE_TAG={{ context[framework][device_key].runtime_image_tag }}
# wheel builder image selection
{% if device == "xpu" %}
ARG WHEEL_BUILDER_IMAGE=${BASE_IMAGE}:${BASE_IMAGE_TAG}
{% elif platform == "multi" %}
{# Multi-arch: manylinux selection is handled via --platform-pinned stage aliases #}
{# in wheel_builder.Dockerfile using TARGETARCH. No static ARG needed here. #}
{% else %}
ARG WHEEL_BUILDER_IMAGE=quay.io/pypa/manylinux_2_28_${ARCH_ALT}
ARG WHEEL_BUILDER_IMAGE=quay.io/pypa/manylinux_2_28_{{ "x86_64" if platform == "amd64" else "aarch64" }}
{% endif %}
# Build configuration
......
......@@ -13,8 +13,7 @@
# pull those binaries/configs in via COPY.
FROM runtime AS dynamo_tools
ARG ARCH
ARG ARCH_ALT
ARG TARGETARCH
ARG DEVICE
ENV DEBIAN_FRONTEND=noninteractive
......@@ -131,9 +130,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
# Estimated layer size: ~500MB–1.5GB (nsight-systems is a full profiling suite)
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
wget -qO - "https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${ARCH}/nvidia.pub" \
wget -qO - "https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${TARGETARCH}/nvidia.pub" \
| gpg --dearmor -o /etc/apt/keyrings/nvidia-devtools.gpg && \
echo "deb [signed-by=/etc/apt/keyrings/nvidia-devtools.gpg] https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${ARCH} /" \
echo "deb [signed-by=/etc/apt/keyrings/nvidia-devtools.gpg] https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${TARGETARCH} /" \
| tee /etc/apt/sources.list.d/nvidia-devtools.list && \
apt-get update && \
apt-get install -y --no-install-recommends nsight-systems-2025.5.1 && \
......@@ -193,7 +192,7 @@ RUN if [ ! -e /usr/bin/python3 ]; then \
# wheels, but dev stage needs it for maturin develop and cargo build from source.
# - SGLang: Copy NIXL/UCX/libfabric/gdrcopy binaries from wheel_builder (not in upstream lmsysorg/sglang runtime).
# - vllm/trtllm/none: NIXL/UCX are already present in runtime (no-op).
ARG ARCH_ALT
ARG TARGETARCH
RUN --mount=from=wheel_builder,target=/wheel_builder \
if [ "${FRAMEWORK}" = "sglang" ]; then \
if [ -d /wheel_builder/usr/local/ucx ] && [ -d /wheel_builder/opt/nvidia/nvda_nixl ]; then \
......@@ -204,20 +203,16 @@ RUN --mount=from=wheel_builder,target=/wheel_builder \
cp /wheel_builder/usr/include/gdrapi.h /usr/include/; \
cp /wheel_builder/usr/lib64/libgdrapi.so* /usr/lib64/; \
echo "/usr/lib64" >> /etc/ld.so.conf.d/gdrcopy.conf; \
# SGLang expects ARCH-qualified lib paths; mirror lib64 into lib/${ARCH_ALT}-linux-gnu for parity.
if [ -d /opt/nvidia/nvda_nixl/lib64 ]; then \
mkdir -p /opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu; \
cp -r /opt/nvidia/nvda_nixl/lib64/. /opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/; \
fi; \
fi; \
fi
# All frameworks use the same path pattern: /opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu
# For vllm/trtllm/none: This resets the same values already set in runtime (no harm)
# For sglang: This sets them for the first time (required)
# NIXL is installed under lib64 (manylinux/AlmaLinux convention used by the wheel_builder).
# All frameworks reference NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64.
# For vllm/trtllm/none: This resets the same values already set in runtime (no harm).
# For sglang: This sets them for the first time (required).
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \
NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins
NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins
# Set universal CUDA development environment variables (all frameworks)
# vLLM: Dockerfile.vllm line 533, 597
......
......@@ -9,8 +9,7 @@
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dynamo_base
ARG ARCH
ARG ARCH_ALT
ARG TARGETARCH
USER root
WORKDIR /opt/dynamo
......@@ -18,7 +17,8 @@ WORKDIR /opt/dynamo
# Install sccache into the base image so downstream stages can COPY it
# instead of downloading from GitHub (avoids 502 errors under parallel builds)
ARG SCCACHE_VERSION=v0.14.0
RUN wget --tries=3 --waitretry=5 \
RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
wget --tries=3 --waitretry=5 \
"https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \
tar -xzf "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \
mv "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl/sccache" /usr/local/bin/ && \
......@@ -31,29 +31,27 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# Install NATS server
ARG NATS_VERSION
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/${NATS_VERSION}/nats-server-${NATS_VERSION}-${ARCH}.deb && \
dpkg -i nats-server-${NATS_VERSION}-${ARCH}.deb && rm nats-server-${NATS_VERSION}-${ARCH}.deb
wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/${NATS_VERSION}/nats-server-${NATS_VERSION}-${TARGETARCH}.deb && \
dpkg -i nats-server-${NATS_VERSION}-${TARGETARCH}.deb && rm nats-server-${NATS_VERSION}-${TARGETARCH}.deb
# Install etcd
ARG ETCD_VERSION
RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \
RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${TARGETARCH}.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \
rm /tmp/etcd.tar.gz
ENV PATH=/usr/local/bin/etcd/:$PATH
# Rust Setup
# Rust environment setup
ENV RUSTUP_HOME=/usr/local/rustup \
CARGO_HOME=/usr/local/cargo \
PATH=/usr/local/cargo/bin:$PATH \
RUST_VERSION=1.93.1
# Define Rust target based on ARCH_ALT ARG
ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu
# Install Rust
RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \
# Install Rust — ARCH_ALT (x86_64/aarch64) is derived from TARGETARCH at build time
RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
RUSTARCH="${ARCH_ALT}-unknown-linux-gnu" && \
wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \
rm rustup-init && \
......
......@@ -9,7 +9,6 @@
FROM dynamo_base AS runtime
ARG ARCH_ALT
ARG PYTHON_VERSION
# Create dynamo user with group 0 for OpenShift compatibility
......@@ -26,8 +25,8 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
# NIXL environment variables
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \
NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins \
NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
CARGO_TARGET_DIR=/opt/dynamo/target
ENV LD_LIBRARY_PATH=\
......@@ -40,7 +39,6 @@ ${LD_LIBRARY_PATH}
# Copy ucx and nixl libs
COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx/ /usr/local/ucx/
COPY --chown=dynamo: --from=wheel_builder ${NIXL_PREFIX}/ ${NIXL_PREFIX}/
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
......
......@@ -18,7 +18,7 @@ RUN apt remove -y python3-apt python3-blinker && \
# This ARG is still utilized for SGLANG Version extraction
ARG RUNTIME_IMAGE_TAG
ARG ARCH_ALT
ARG TARGETARCH
WORKDIR /workspace
# Install NATS and ETCD
......@@ -68,13 +68,12 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src
# NIXL environment and native libraries
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# Copy UCX and NIXL native libraries to system directories
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo:0 --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
COPY --chown=dynamo:0 --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
ENV PATH=/usr/local/ucx/bin:$PATH
......
......@@ -29,7 +29,7 @@ FROM ${TRTLLM_WHEEL_IMAGE} AS trtllm_wheel_image
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework
ARG ARCH_ALT
ARG TARGETARCH
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/
# Install minimal dependencies needed for TensorRT-LLM installation
......@@ -149,6 +149,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \
TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \
PYTHON_TAG="cp$(echo ${PYTHON_VERSION} | tr -d '.')"; \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
DIRECT_URL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-${TRTLLM_VERSION}-${PYTHON_TAG}-${PYTHON_TAG}-linux_${ARCH_ALT}.whl"; \
uv pip install --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${DIRECT_URL}" triton==3.5.1; \
else \
......
......@@ -24,7 +24,7 @@
FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
ARG ARCH_ALT
ARG TARGETARCH
WORKDIR /workspace
ENV ENV=${ENV:-/etc/shinit_v2}
ENV VIRTUAL_ENV=/opt/dynamo/venv
......@@ -56,8 +56,11 @@ ENV CUDA_HOME=/usr/local/cuda \
# Copy OpenMPI from PyTorch base image
COPY --from=pytorch_base /opt/hpcx/ompi /opt/hpcx/ompi
# Copy NUMA library from PyTorch base image
COPY --from=pytorch_base /usr/lib/${ARCH_ALT}-linux-gnu/libnuma.so* /usr/lib/${ARCH_ALT}-linux-gnu/
# Copy NUMA library from PyTorch base image (arch-dependent path)
RUN --mount=type=bind,from=pytorch_base,source=/usr/lib,target=/mnt/usr_lib \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
mkdir -p /usr/lib/${ARCH_ALT}-linux-gnu && \
cp /mnt/usr_lib/${ARCH_ALT}-linux-gnu/libnuma.so* /usr/lib/${ARCH_ALT}-linux-gnu/
# Copy UCX libraries, libucc.so is needed by pytorch. May not need to copy whole hpcx dir but only /opt/hpcx/ucc/
COPY --from=pytorch_base /opt/hpcx /opt/hpcx
......@@ -93,6 +96,7 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
ARG PYTHON_VERSION
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
if [ ${ARCH_ALT} = "x86_64" ]; then \
ARCH_FOR_GPG=${ARCH_ALT}; \
else \
......@@ -160,6 +164,19 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca
cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
{% endif %}
# Copy TensorRT and libgomp from framework image (arch-dependent path, needs root)
COPY --from=framework /usr/local/tensorrt /usr/local/tensorrt
RUN --mount=type=bind,from=framework,source=/usr/lib,target=/mnt/usr_lib \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
cp /mnt/usr_lib/${ARCH_ALT}-linux-gnu/libgomp.so* /usr/lib/${ARCH_ALT}-linux-gnu/
# Register arch-dependent TensorRT and nvshmem library paths with ldconfig so the
# dynamic linker finds them in every execution context (docker run, exec, k8s, etc.)
RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
echo "/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib" > /etc/ld.so.conf.d/tensorrt.conf && \
echo "/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13" >> /etc/ld.so.conf.d/tensorrt.conf && \
ldconfig
# Switch to dynamo user
USER dynamo
ENV HOME=/home/dynamo
......@@ -168,13 +185,9 @@ SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
ENV DYNAMO_HOME=/workspace
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# Copy libgomp.so from framework image
COPY --from=framework /usr/local/tensorrt /usr/local/tensorrt
COPY --from=framework /usr/lib/${ARCH_ALT}-linux-gnu/libgomp.so* /usr/lib/${ARCH_ALT}-linux-gnu/
# Copy pre-built venv with PyTorch and TensorRT-LLM from framework stage
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
......@@ -184,20 +197,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
# Copy dynamo wheels for gitlab artifacts (read-only, no group-write needed)
COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
ENV TENSORRT_LIB_DIR=/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib
ENV PATH="/usr/local/ucx/bin:${VIRTUAL_ENV}/bin:/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin:$PATH"
# Both arch paths are listed; the dynamic loader silently skips whichever directory does not exist in the image.
ENV LD_LIBRARY_PATH=\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
/opt/hpcx/ompi/lib:\
/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13/:\
$TENSORRT_LIB_DIR:\
/usr/local/tensorrt/targets/x86_64-linux-gnu/lib:\
/usr/local/tensorrt/targets/aarch64-linux-gnu/lib:\
/usr/lib/x86_64-linux-gnu/nvshmem/13/:\
/usr/lib/aarch64-linux-gnu/nvshmem/13/:\
/opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch/lib:\
/opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt/lib:\
/usr/local/cuda/lib:\
......
......@@ -61,7 +61,7 @@ RUN mkdir -p /opt/dynamo/venv && \
ENV VIRTUAL_ENV=/opt/dynamo/venv \
PATH="/opt/dynamo/venv/bin:${PATH}"
ARG ARCH
ARG TARGETARCH
# Install vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF
......@@ -98,7 +98,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
--device $DEVICE \
--vllm-ref $VLLM_REF \
--max-jobs $MAX_JOBS \
--arch $ARCH \
--arch $TARGETARCH \
--installation-dir /opt \
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \
......
......@@ -89,7 +89,6 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
# NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
&& mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
ARG ARCH_ALT
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
......@@ -171,11 +170,11 @@ SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
{% if device == "xpu" %}
ENV NIXL_PREFIX=/opt/intel/intel_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/x86_64-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% else %}
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% endif %}
......@@ -222,20 +221,13 @@ COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
{% if device == "xpu" %}
COPY --chown=dynamo: --from=wheel_builder /opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/. ${NIXL_LIB_DIR}/
{% else %}
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
{# XPU NIXL uses lib/x86_64-linux-gnu; copy to NIXL_LIB_DIR to ensure lib dir is populated #}
COPY --chown=dynamo: --from=wheel_builder /opt/intel/intel_nixl/lib/x86_64-linux-gnu/. ${NIXL_LIB_DIR}/
{% endif %}
{# For cuda: NIXL_LIB_DIR = lib64, already included in the $NIXL_PREFIX COPY above #}
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
{% if device == "cuda" %}
# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/
{% endif %}
ENV PATH=/usr/local/ucx/bin:$PATH
......@@ -347,6 +339,12 @@ RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} &
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
{% if device == "cuda" %}
# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/
# Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks)
# This recreates proper symlinks to save space and suppress ldconfig warnings
RUN cd /usr/local/lib && \
......
......@@ -7,16 +7,28 @@
##### Wheel Build Image ##########
##################################
{% if platform == "multi" and device == "cuda" %}
# Multi-arch: declare both manylinux base images with explicit --platform so each is
# always pulled as the correct native arch regardless of the current TARGETPLATFORM.
# BuildKit only fetches and builds the stage that TARGETARCH resolves to; the other
# is a no-op for each sub-build.
FROM --platform=linux/amd64 quay.io/pypa/manylinux_2_28_x86_64 AS manylinux_amd64
FROM --platform=linux/arm64 quay.io/pypa/manylinux_2_28_aarch64 AS manylinux_arm64
{% endif %}
##################################
##### wheel_builder_base #########
##################################
# Shared base for all wheel builds: tools, system deps, and native libraries (except nixl).
{% if platform == "multi" and device == "cuda" %}
FROM manylinux_${TARGETARCH} AS wheel_builder_base
{% else %}
FROM ${WHEEL_BUILDER_IMAGE} AS wheel_builder_base
{% endif %}
# Redeclare ARGs for this stage
ARG ARCH
ARG ARCH_ALT
ARG TARGETARCH
ARG CARGO_BUILD_JOBS
ARG DEVICE
......@@ -35,6 +47,8 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
CARGO_TARGET_DIR=/opt/dynamo/target \
PATH=/usr/local/cargo/bin:$PATH
# Copy artifacts from base stage
COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME
......@@ -158,6 +172,7 @@ ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \
# Ensure a modern protoc is available (required for --experimental_allow_proto3_optional)
RUN set -eux; \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
PROTOC_VERSION=25.3; \
case "${ARCH_ALT}" in \
x86_64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-x86_64.zip" ;; \
......@@ -200,7 +215,8 @@ ARG NIXL_UCX_REF
ARG NIXL_GDRCOPY_REF
# Build and install gdrcopy
RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/gdrcopy.git && \
RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/gdrcopy.git && \
cd gdrcopy/packages && \
CUDA=/usr/local/cuda ./build-rpm-packages.sh && \
rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \
......@@ -233,7 +249,7 @@ ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
ARG FFMPEG_VERSION
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \
fi && \
......@@ -273,7 +289,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
# Build and install UCX
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \
fi && \
......@@ -327,7 +343,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
ARG NIXL_LIBFABRIC_REF
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \
fi && \
......@@ -359,7 +375,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
ARG AWS_SDK_CPP_VERSION=1.11.760
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env cmake); \
fi && \
......@@ -397,7 +413,6 @@ COPY lib/ /opt/dynamo/lib/
COPY components/ /opt/dynamo/components/
# Build ai-dynamo (pure Python) and ai-dynamo-runtime (maturin) wheels
ARG ARCH
ARG USE_SCCACHE
ARG ENABLE_MEDIA_FFMPEG
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
......@@ -406,7 +421,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=cache,target=/root/.cargo/git \
--mount=type=cache,target=/root/.cache/uv \
export UV_CACHE_DIR=/root/.cache/uv && \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env cmake); \
fi && \
......@@ -460,8 +475,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
FROM wheel_builder_base AS wheel_builder
# Build and install nixl
ARG ARCH
ARG ARCH_ALT
ARG TARGETARCH
ARG DEVICE
ARG NIXL_REF
ARG USE_SCCACHE
......@@ -471,7 +485,7 @@ ARG CUDA_MAJOR
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \
fi && \
......@@ -502,8 +516,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
/tmp/use-sccache.sh show-stats "NIXL"
{% if device == "xpu" %}
ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu \
NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/plugins \
{# XPU only supports x86_64; no ARCH_ALT ARG needed #}
ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/x86_64-linux-gnu \
NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/x86_64-linux-gnu/plugins \
NIXL_PREFIX=/opt/intel/intel_nixl
{% else %}
ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
......@@ -523,7 +538,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
--mount=type=cache,target=/root/.cache/uv \
export UV_CACHE_DIR=/root/.cache/uv && \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \
fi && \
......@@ -545,7 +560,8 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=cache,target=/root/.cargo/git \
--mount=type=cache,target=/root/.cache/uv \
export UV_CACHE_DIR=/root/.cache/uv && \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env cmake); \
fi && \
......
......@@ -15,7 +15,7 @@ usage() {
Usage: $0 [COMMAND] [OPTIONS]
Commands:
install Install sccache binary (requires ARCH_ALT environment variable)
install Install sccache binary (architecture auto-detected via uname -m)
setup-env Print export statements to configure sccache for compilation
show-stats Display sccache statistics with optional build name
help Show this help message
......@@ -33,10 +33,9 @@ Environment variables:
SCCACHE_BUCKET S3 bucket name (fallback if not passed as parameter)
SCCACHE_REGION S3 region (fallback if not passed as parameter)
ARCH Architecture for S3 key prefix (fallback if not passed as parameter)
ARCH_ALT Alternative architecture name for downloads (e.g., x86_64, aarch64)
Examples:
ARCH_ALT=x86_64 $0 install
$0 install # architecture auto-detected via uname -m
eval \$($0 setup-env) # autotools / Meson
eval \$($0 setup-env cmake) # CMake builds
$0 show-stats "UCX"
......@@ -44,18 +43,21 @@ EOF
}
install_sccache() {
# Derive arch from TARGETARCH (set by BuildKit) with uname -m fallback
local arch_alt
if [ -n "${TARGETARCH:-}" ]; then
arch_alt=$([ "$TARGETARCH" = "amd64" ] && echo "x86_64" || echo "aarch64")
else
arch_alt=$(uname -m)
fi
if command -v sccache >/dev/null 2>&1; then
echo "sccache already installed at $(command -v sccache), skipping download"
else
if [ -z "${ARCH_ALT:-}" ]; then
echo "Error: ARCH_ALT environment variable is required for sccache installation"
exit 1
fi
echo "Installing sccache ${SCCACHE_VERSION} for architecture ${ARCH_ALT}..."
echo "Installing sccache ${SCCACHE_VERSION} for architecture ${arch_alt}..."
wget --tries=3 --waitretry=5 \
"https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz"
tar -xzf "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz"
mv "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl/sccache" /usr/local/bin/
"https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${arch_alt}-unknown-linux-musl.tar.gz"
tar -xzf "sccache-${SCCACHE_VERSION}-${arch_alt}-unknown-linux-musl.tar.gz"
mv "sccache-${SCCACHE_VERSION}-${arch_alt}-unknown-linux-musl/sccache" /usr/local/bin/
rm -rf sccache*
fi
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment