Unverified Commit da354663 authored by Ran Rubin's avatar Ran Rubin Committed by GitHub
Browse files

chore: Modify Docker templates to support multi arch builds (#6936)

parent 545fa300
...@@ -9,6 +9,31 @@ from pathlib import Path ...@@ -9,6 +9,31 @@ from pathlib import Path
import yaml import yaml
from jinja2 import Environment, FileSystemLoader, StrictUndefined from jinja2 import Environment, FileSystemLoader, StrictUndefined
# Architectures the Dockerfile templates know how to render.
_VALID_ARCHS = {"amd64", "arm64"}


def parse_platform(platform_str: str) -> str:
    """Normalize a --platform value to the template variable used by Jinja2.

    Accepts Docker-style values (``linux/amd64``, ``linux/arm64``, optionally
    with a variant such as ``linux/arm64/v8``), the short form (``amd64``,
    ``arm64``), and comma-separated lists for multi-arch builds
    (``linux/amd64,linux/arm64``).

    Args:
        platform_str: Raw value of the ``--platform`` command-line option.

    Returns:
        ``'amd64'`` or ``'arm64'`` when exactly one distinct architecture is
        requested, or ``'multi'`` when more than one is requested.

    Raises:
        ValueError: If any entry names an unrecognized architecture
            (including an empty entry, e.g. from a trailing comma).
    """
    archs = []
    for entry in platform_str.split(","):
        fields = entry.strip().split("/")
        # Docker platform strings are "os/arch[/variant]"; a bare value is
        # just the architecture. Taking the last field would misread the
        # variant in "linux/arm64/v8" as the architecture.
        arch = fields[0] if len(fields) == 1 else fields[1]
        if arch not in _VALID_ARCHS:
            raise ValueError(
                f"Unrecognized architecture '{arch}' in --platform '{platform_str}'. "
                f"Valid architectures: {', '.join(sorted(_VALID_ARCHS))}"
            )
        # Repeating the same architecture does not make the build multi-arch.
        if arch not in archs:
            archs.append(arch)
    if len(archs) > 1:
        return "multi"
    return archs[0]
def parse_args(): def parse_args():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
...@@ -39,8 +64,15 @@ def parse_args(): ...@@ -39,8 +64,15 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--platform", "--platform",
type=str, type=str,
default="amd64", default="linux/amd64",
help="Dockerfile platform to use. [amd64, arm64]", help=(
"Target platform(s), Docker-style. Examples:\n"
" linux/amd64 single-arch amd64 build\n"
" linux/arm64 single-arch arm64 build\n"
" linux/amd64,linux/arm64 multi-arch build; the rendered Dockerfile uses\n"
" Docker BuildX TARGETARCH directly (set per platform\n"
" by: docker buildx build --platform linux/amd64,linux/arm64)"
),
) )
parser.add_argument( parser.add_argument(
"--cuda-version", "--cuda-version",
...@@ -145,7 +177,7 @@ def render(args, context, script_dir): ...@@ -145,7 +177,7 @@ def render(args, context, script_dir):
framework=args.framework, framework=args.framework,
device=args.device, device=args.device,
target=args.target, target=args.target,
platform=args.platform, platform=args.platform, # normalized: 'amd64', 'arm64', or 'multi'
cuda_version=args.cuda_version, cuda_version=args.cuda_version,
make_efa=args.make_efa, make_efa=args.make_efa,
) )
...@@ -174,6 +206,9 @@ def render(args, context, script_dir): ...@@ -174,6 +206,9 @@ def render(args, context, script_dir):
def main(): def main():
args = parse_args() args = parse_args()
# Normalize platform to template variable ('amd64', 'arm64', or 'multi')
# and store it back so render() and validate_args() both see the normalized form.
args.platform = parse_platform(args.platform)
validate_args(args) validate_args(args)
# Clear cuda version for non-cuda device # Clear cuda version for non-cuda device
if args.device != "cuda": if args.device != "cuda":
......
...@@ -6,18 +6,14 @@ ...@@ -6,18 +6,14 @@
########################## ##########################
#### Build Arguments ##### #### Build Arguments #####
########################## ##########################
# Define general architecture ARGs for supporting both x86 and aarch64 builds. # TARGETARCH is set automatically by Docker BuildKit for every --platform build.
# ARCH: Used for package suffixes (e.g., amd64, arm64) # It must NOT be declared in the global scope (before any FROM) — doing so shadows
# ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64) # the automatic per-platform value that BuildKit injects.
# #
# Default values are for x86/amd64: # In each stage that needs it, re-declare with: ARG TARGETARCH
# --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64
# #
# For arm64/aarch64, build with: # ARCH_ALT (x86_64 / aarch64) is computed inline in RUN steps:
# --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64 # ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64")
#TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg
ARG ARCH={{ platform }}
ARG ARCH_ALT={{ "x86_64" if platform == "amd64" else "aarch64" }}
ARG DEVICE={{ device }} ARG DEVICE={{ device }}
{% if device == "cuda" -%} {% if device == "cuda" -%}
{% set device_key = device + cuda_version -%} {% set device_key = device + cuda_version -%}
...@@ -43,8 +39,11 @@ ARG RUNTIME_IMAGE_TAG={{ context[framework][device_key].runtime_image_tag }} ...@@ -43,8 +39,11 @@ ARG RUNTIME_IMAGE_TAG={{ context[framework][device_key].runtime_image_tag }}
# wheel builder image selection # wheel builder image selection
{% if device == "xpu" %} {% if device == "xpu" %}
ARG WHEEL_BUILDER_IMAGE=${BASE_IMAGE}:${BASE_IMAGE_TAG} ARG WHEEL_BUILDER_IMAGE=${BASE_IMAGE}:${BASE_IMAGE_TAG}
{% elif platform == "multi" %}
{# Multi-arch: manylinux selection is handled via --platform-pinned stage aliases #}
{# in wheel_builder.Dockerfile using TARGETARCH. No static ARG needed here. #}
{% else %} {% else %}
ARG WHEEL_BUILDER_IMAGE=quay.io/pypa/manylinux_2_28_${ARCH_ALT} ARG WHEEL_BUILDER_IMAGE=quay.io/pypa/manylinux_2_28_{{ "x86_64" if platform == "amd64" else "aarch64" }}
{% endif %} {% endif %}
# Build configuration # Build configuration
......
...@@ -13,8 +13,7 @@ ...@@ -13,8 +13,7 @@
# pull those binaries/configs in via COPY. # pull those binaries/configs in via COPY.
FROM runtime AS dynamo_tools FROM runtime AS dynamo_tools
ARG ARCH ARG TARGETARCH
ARG ARCH_ALT
ARG DEVICE ARG DEVICE
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
...@@ -131,9 +130,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ ...@@ -131,9 +130,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
# Estimated layer size: ~500MB–1.5GB (nsight-systems is a full profiling suite) # Estimated layer size: ~500MB–1.5GB (nsight-systems is a full profiling suite)
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds. # Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
wget -qO - "https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${ARCH}/nvidia.pub" \ wget -qO - "https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${TARGETARCH}/nvidia.pub" \
| gpg --dearmor -o /etc/apt/keyrings/nvidia-devtools.gpg && \ | gpg --dearmor -o /etc/apt/keyrings/nvidia-devtools.gpg && \
echo "deb [signed-by=/etc/apt/keyrings/nvidia-devtools.gpg] https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${ARCH} /" \ echo "deb [signed-by=/etc/apt/keyrings/nvidia-devtools.gpg] https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${TARGETARCH} /" \
| tee /etc/apt/sources.list.d/nvidia-devtools.list && \ | tee /etc/apt/sources.list.d/nvidia-devtools.list && \
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends nsight-systems-2025.5.1 && \ apt-get install -y --no-install-recommends nsight-systems-2025.5.1 && \
...@@ -193,7 +192,7 @@ RUN if [ ! -e /usr/bin/python3 ]; then \ ...@@ -193,7 +192,7 @@ RUN if [ ! -e /usr/bin/python3 ]; then \
# wheels, but dev stage needs it for maturin develop and cargo build from source. # wheels, but dev stage needs it for maturin develop and cargo build from source.
# - SGLang: Copy NIXL/UCX/libfabric/gdrcopy binaries from wheel_builder (not in upstream lmsysorg/sglang runtime). # - SGLang: Copy NIXL/UCX/libfabric/gdrcopy binaries from wheel_builder (not in upstream lmsysorg/sglang runtime).
# - vllm/trtllm/none: NIXL/UCX are already present in runtime (no-op). # - vllm/trtllm/none: NIXL/UCX are already present in runtime (no-op).
ARG ARCH_ALT ARG TARGETARCH
RUN --mount=from=wheel_builder,target=/wheel_builder \ RUN --mount=from=wheel_builder,target=/wheel_builder \
if [ "${FRAMEWORK}" = "sglang" ]; then \ if [ "${FRAMEWORK}" = "sglang" ]; then \
if [ -d /wheel_builder/usr/local/ucx ] && [ -d /wheel_builder/opt/nvidia/nvda_nixl ]; then \ if [ -d /wheel_builder/usr/local/ucx ] && [ -d /wheel_builder/opt/nvidia/nvda_nixl ]; then \
...@@ -204,20 +203,16 @@ RUN --mount=from=wheel_builder,target=/wheel_builder \ ...@@ -204,20 +203,16 @@ RUN --mount=from=wheel_builder,target=/wheel_builder \
cp /wheel_builder/usr/include/gdrapi.h /usr/include/; \ cp /wheel_builder/usr/include/gdrapi.h /usr/include/; \
cp /wheel_builder/usr/lib64/libgdrapi.so* /usr/lib64/; \ cp /wheel_builder/usr/lib64/libgdrapi.so* /usr/lib64/; \
echo "/usr/lib64" >> /etc/ld.so.conf.d/gdrcopy.conf; \ echo "/usr/lib64" >> /etc/ld.so.conf.d/gdrcopy.conf; \
# SGLang expects ARCH-qualified lib paths; mirror lib64 into lib/${ARCH_ALT}-linux-gnu for parity.
if [ -d /opt/nvidia/nvda_nixl/lib64 ]; then \
mkdir -p /opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu; \
cp -r /opt/nvidia/nvda_nixl/lib64/. /opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/; \
fi; \
fi; \ fi; \
fi fi
# All frameworks use the same path pattern: /opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu # NIXL is installed under lib64 (manylinux/AlmaLinux convention used by the wheel_builder).
# For vllm/trtllm/none: This resets the same values already set in runtime (no harm) # All frameworks reference NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64.
# For sglang: This sets them for the first time (required) # For vllm/trtllm/none: This resets the same values already set in runtime (no harm).
# For sglang: This sets them for the first time (required).
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \ ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \
NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \ NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins
# Set universal CUDA development environment variables (all frameworks) # Set universal CUDA development environment variables (all frameworks)
# vLLM: Dockerfile.vllm line 533, 597 # vLLM: Dockerfile.vllm line 533, 597
......
...@@ -9,8 +9,7 @@ ...@@ -9,8 +9,7 @@
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dynamo_base FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dynamo_base
ARG ARCH ARG TARGETARCH
ARG ARCH_ALT
USER root USER root
WORKDIR /opt/dynamo WORKDIR /opt/dynamo
...@@ -18,7 +17,8 @@ WORKDIR /opt/dynamo ...@@ -18,7 +17,8 @@ WORKDIR /opt/dynamo
# Install sccache into the base image so downstream stages can COPY it # Install sccache into the base image so downstream stages can COPY it
# instead of downloading from GitHub (avoids 502 errors under parallel builds) # instead of downloading from GitHub (avoids 502 errors under parallel builds)
ARG SCCACHE_VERSION=v0.14.0 ARG SCCACHE_VERSION=v0.14.0
RUN wget --tries=3 --waitretry=5 \ RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
wget --tries=3 --waitretry=5 \
"https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \ "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \
tar -xzf "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \ tar -xzf "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \
mv "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl/sccache" /usr/local/bin/ && \ mv "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl/sccache" /usr/local/bin/ && \
...@@ -31,29 +31,27 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ ...@@ -31,29 +31,27 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# Install NATS server # Install NATS server
ARG NATS_VERSION ARG NATS_VERSION
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/${NATS_VERSION}/nats-server-${NATS_VERSION}-${ARCH}.deb && \ wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/${NATS_VERSION}/nats-server-${NATS_VERSION}-${TARGETARCH}.deb && \
dpkg -i nats-server-${NATS_VERSION}-${ARCH}.deb && rm nats-server-${NATS_VERSION}-${ARCH}.deb dpkg -i nats-server-${NATS_VERSION}-${TARGETARCH}.deb && rm nats-server-${NATS_VERSION}-${TARGETARCH}.deb
# Install etcd # Install etcd
ARG ETCD_VERSION ARG ETCD_VERSION
RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${TARGETARCH}.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \ mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \ tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \
rm /tmp/etcd.tar.gz rm /tmp/etcd.tar.gz
ENV PATH=/usr/local/bin/etcd/:$PATH ENV PATH=/usr/local/bin/etcd/:$PATH
# Rust Setup # Rust Setup
# Rust environment setup
ENV RUSTUP_HOME=/usr/local/rustup \ ENV RUSTUP_HOME=/usr/local/rustup \
CARGO_HOME=/usr/local/cargo \ CARGO_HOME=/usr/local/cargo \
PATH=/usr/local/cargo/bin:$PATH \ PATH=/usr/local/cargo/bin:$PATH \
RUST_VERSION=1.93.1 RUST_VERSION=1.93.1
# Define Rust target based on ARCH_ALT ARG # Install Rust — ARCH_ALT (x86_64/aarch64) is derived from TARGETARCH at build time
ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
RUSTARCH="${ARCH_ALT}-unknown-linux-gnu" && \
# Install Rust wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \
RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \
chmod +x rustup-init && \ chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \
rm rustup-init && \ rm rustup-init && \
......
...@@ -9,7 +9,6 @@ ...@@ -9,7 +9,6 @@
FROM dynamo_base AS runtime FROM dynamo_base AS runtime
ARG ARCH_ALT
ARG PYTHON_VERSION ARG PYTHON_VERSION
# Create dynamo user with group 0 for OpenShift compatibility # Create dynamo user with group 0 for OpenShift compatibility
...@@ -26,8 +25,8 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \ ...@@ -26,8 +25,8 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
# NIXL environment variables # NIXL environment variables
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \ ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \
NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \ NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins \ NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
CARGO_TARGET_DIR=/opt/dynamo/target CARGO_TARGET_DIR=/opt/dynamo/target
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
...@@ -40,7 +39,6 @@ ${LD_LIBRARY_PATH} ...@@ -40,7 +39,6 @@ ${LD_LIBRARY_PATH}
# Copy ucx and nixl libs # Copy ucx and nixl libs
COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx/ /usr/local/ucx/ COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx/ /usr/local/ucx/
COPY --chown=dynamo: --from=wheel_builder ${NIXL_PREFIX}/ ${NIXL_PREFIX}/ COPY --chown=dynamo: --from=wheel_builder ${NIXL_PREFIX}/ ${NIXL_PREFIX}/
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
......
...@@ -18,7 +18,7 @@ RUN apt remove -y python3-apt python3-blinker && \ ...@@ -18,7 +18,7 @@ RUN apt remove -y python3-apt python3-blinker && \
# This ARG is still utilized for SGLANG Version extraction # This ARG is still utilized for SGLANG Version extraction
ARG RUNTIME_IMAGE_TAG ARG RUNTIME_IMAGE_TAG
ARG ARCH_ALT ARG TARGETARCH
WORKDIR /workspace WORKDIR /workspace
# Install NATS and ETCD # Install NATS and ETCD
...@@ -68,13 +68,12 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src ...@@ -68,13 +68,12 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src
# NIXL environment and native libraries # NIXL environment and native libraries
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# Copy UCX and NIXL native libraries to system directories # Copy UCX and NIXL native libraries to system directories
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo:0 --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX COPY --chown=dynamo:0 --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
COPY --chown=dynamo:0 --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
ENV PATH=/usr/local/ucx/bin:$PATH ENV PATH=/usr/local/ucx/bin:$PATH
......
...@@ -29,7 +29,7 @@ FROM ${TRTLLM_WHEEL_IMAGE} AS trtllm_wheel_image ...@@ -29,7 +29,7 @@ FROM ${TRTLLM_WHEEL_IMAGE} AS trtllm_wheel_image
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework
ARG ARCH_ALT ARG TARGETARCH
COPY --from=dynamo_base /bin/uv /bin/uvx /bin/ COPY --from=dynamo_base /bin/uv /bin/uvx /bin/
# Install minimal dependencies needed for TensorRT-LLM installation # Install minimal dependencies needed for TensorRT-LLM installation
...@@ -149,6 +149,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -149,6 +149,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \ if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \
TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \ TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \
PYTHON_TAG="cp$(echo ${PYTHON_VERSION} | tr -d '.')"; \ PYTHON_TAG="cp$(echo ${PYTHON_VERSION} | tr -d '.')"; \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
DIRECT_URL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-${TRTLLM_VERSION}-${PYTHON_TAG}-${PYTHON_TAG}-linux_${ARCH_ALT}.whl"; \ DIRECT_URL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-${TRTLLM_VERSION}-${PYTHON_TAG}-${PYTHON_TAG}-linux_${ARCH_ALT}.whl"; \
uv pip install --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${DIRECT_URL}" triton==3.5.1; \ uv pip install --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${DIRECT_URL}" triton==3.5.1; \
else \ else \
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
ARG ARCH_ALT ARG TARGETARCH
WORKDIR /workspace WORKDIR /workspace
ENV ENV=${ENV:-/etc/shinit_v2} ENV ENV=${ENV:-/etc/shinit_v2}
ENV VIRTUAL_ENV=/opt/dynamo/venv ENV VIRTUAL_ENV=/opt/dynamo/venv
...@@ -56,8 +56,11 @@ ENV CUDA_HOME=/usr/local/cuda \ ...@@ -56,8 +56,11 @@ ENV CUDA_HOME=/usr/local/cuda \
# Copy OpenMPI from PyTorch base image # Copy OpenMPI from PyTorch base image
COPY --from=pytorch_base /opt/hpcx/ompi /opt/hpcx/ompi COPY --from=pytorch_base /opt/hpcx/ompi /opt/hpcx/ompi
# Copy NUMA library from PyTorch base image # Copy NUMA library from PyTorch base image (arch-dependent path)
COPY --from=pytorch_base /usr/lib/${ARCH_ALT}-linux-gnu/libnuma.so* /usr/lib/${ARCH_ALT}-linux-gnu/ RUN --mount=type=bind,from=pytorch_base,source=/usr/lib,target=/mnt/usr_lib \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
mkdir -p /usr/lib/${ARCH_ALT}-linux-gnu && \
cp /mnt/usr_lib/${ARCH_ALT}-linux-gnu/libnuma.so* /usr/lib/${ARCH_ALT}-linux-gnu/
# Copy UCX libraries, libucc.so is needed by pytorch. May not need to copy whole hpcx dir but only /opt/hpcx/ucc/ # Copy UCX libraries, libucc.so is needed by pytorch. May not need to copy whole hpcx dir but only /opt/hpcx/ucc/
COPY --from=pytorch_base /opt/hpcx /opt/hpcx COPY --from=pytorch_base /opt/hpcx /opt/hpcx
...@@ -93,6 +96,7 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \ ...@@ -93,6 +96,7 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
# Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds. # Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
ARG PYTHON_VERSION ARG PYTHON_VERSION
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
if [ ${ARCH_ALT} = "x86_64" ]; then \ if [ ${ARCH_ALT} = "x86_64" ]; then \
ARCH_FOR_GPG=${ARCH_ALT}; \ ARCH_FOR_GPG=${ARCH_ALT}; \
else \ else \
...@@ -160,6 +164,19 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca ...@@ -160,6 +164,19 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca
cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
{% endif %} {% endif %}
# Copy TensorRT and libgomp from framework image (arch-dependent path, needs root)
COPY --from=framework /usr/local/tensorrt /usr/local/tensorrt
RUN --mount=type=bind,from=framework,source=/usr/lib,target=/mnt/usr_lib \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
cp /mnt/usr_lib/${ARCH_ALT}-linux-gnu/libgomp.so* /usr/lib/${ARCH_ALT}-linux-gnu/
# Register arch-dependent TensorRT and nvshmem library paths with ldconfig so the
# dynamic linker finds them in every execution context (docker run, exec, k8s, etc.)
RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
echo "/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib" > /etc/ld.so.conf.d/tensorrt.conf && \
echo "/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13" >> /etc/ld.so.conf.d/tensorrt.conf && \
ldconfig
# Switch to dynamo user # Switch to dynamo user
USER dynamo USER dynamo
ENV HOME=/home/dynamo ENV HOME=/home/dynamo
...@@ -168,13 +185,9 @@ SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"] ...@@ -168,13 +185,9 @@ SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
ENV DYNAMO_HOME=/workspace ENV DYNAMO_HOME=/workspace
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# Copy libgomp.so from framework image
COPY --from=framework /usr/local/tensorrt /usr/local/tensorrt
COPY --from=framework /usr/lib/${ARCH_ALT}-linux-gnu/libgomp.so* /usr/lib/${ARCH_ALT}-linux-gnu/
# Copy pre-built venv with PyTorch and TensorRT-LLM from framework stage # Copy pre-built venv with PyTorch and TensorRT-LLM from framework stage
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path> # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
COPY --chmod=775 --chown=dynamo:0 --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV} COPY --chmod=775 --chown=dynamo:0 --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
...@@ -184,20 +197,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV} ...@@ -184,20 +197,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
# Copy dynamo wheels for gitlab artifacts (read-only, no group-write needed) # Copy dynamo wheels for gitlab artifacts (read-only, no group-write needed)
COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx /usr/local/ucx COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
ENV TENSORRT_LIB_DIR=/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib
ENV PATH="/usr/local/ucx/bin:${VIRTUAL_ENV}/bin:/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin:$PATH" ENV PATH="/usr/local/ucx/bin:${VIRTUAL_ENV}/bin:/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin:$PATH"
# Both arch paths are listed; the non-existent one is silently ignored by the linker.
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
$NIXL_LIB_DIR:\ $NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\ $NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\ /usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\ /usr/local/ucx/lib/ucx:\
/opt/hpcx/ompi/lib:\ /opt/hpcx/ompi/lib:\
/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13/:\ /usr/local/tensorrt/targets/x86_64-linux-gnu/lib:\
$TENSORRT_LIB_DIR:\ /usr/local/tensorrt/targets/aarch64-linux-gnu/lib:\
/usr/lib/x86_64-linux-gnu/nvshmem/13/:\
/usr/lib/aarch64-linux-gnu/nvshmem/13/:\
/opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch/lib:\ /opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch/lib:\
/opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt/lib:\ /opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt/lib:\
/usr/local/cuda/lib:\ /usr/local/cuda/lib:\
......
...@@ -61,7 +61,7 @@ RUN mkdir -p /opt/dynamo/venv && \ ...@@ -61,7 +61,7 @@ RUN mkdir -p /opt/dynamo/venv && \
ENV VIRTUAL_ENV=/opt/dynamo/venv \ ENV VIRTUAL_ENV=/opt/dynamo/venv \
PATH="/opt/dynamo/venv/bin:${PATH}" PATH="/opt/dynamo/venv/bin:${PATH}"
ARG ARCH ARG TARGETARCH
# Install vllm - keep this early in Dockerfile to avoid # Install vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes # rebuilds from unrelated source code changes
ARG VLLM_REF ARG VLLM_REF
...@@ -98,7 +98,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ ...@@ -98,7 +98,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
--device $DEVICE \ --device $DEVICE \
--vllm-ref $VLLM_REF \ --vllm-ref $VLLM_REF \
--max-jobs $MAX_JOBS \ --max-jobs $MAX_JOBS \
--arch $ARCH \ --arch $TARGETARCH \
--installation-dir /opt \ --installation-dir /opt \
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \ ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \ ${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \
......
...@@ -89,7 +89,6 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \ ...@@ -89,7 +89,6 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
# NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable # NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
&& mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh && mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
ARG ARCH_ALT
ARG PYTHON_VERSION ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION} ENV PYTHON_VERSION=${PYTHON_VERSION}
...@@ -171,11 +170,11 @@ SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"] ...@@ -171,11 +170,11 @@ SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
{% if device == "xpu" %} {% if device == "xpu" %}
ENV NIXL_PREFIX=/opt/intel/intel_nixl ENV NIXL_PREFIX=/opt/intel/intel_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/x86_64-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% else %} {% else %}
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
{% endif %} {% endif %}
...@@ -222,20 +221,13 @@ COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm ...@@ -222,20 +221,13 @@ COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm
COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
{% if device == "xpu" %} {% if device == "xpu" %}
COPY --chown=dynamo: --from=wheel_builder /opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/. ${NIXL_LIB_DIR}/ {# XPU NIXL uses lib/x86_64-linux-gnu; copy to NIXL_LIB_DIR to ensure lib dir is populated #}
{% else %} COPY --chown=dynamo: --from=wheel_builder /opt/intel/intel_nixl/lib/x86_64-linux-gnu/. ${NIXL_LIB_DIR}/
COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
{% endif %} {% endif %}
{# For cuda: NIXL_LIB_DIR = lib64, already included in the $NIXL_PREFIX COPY above #}
COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
{% if device == "cuda" %}
# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/
{% endif %}
ENV PATH=/usr/local/ucx/bin:$PATH ENV PATH=/usr/local/ucx/bin:$PATH
...@@ -347,6 +339,12 @@ RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} & ...@@ -347,6 +339,12 @@ RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} &
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
{% if device == "cuda" %} {% if device == "cuda" %}
# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/
# Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks) # Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks)
# This recreates proper symlinks to save space and suppress ldconfig warnings # This recreates proper symlinks to save space and suppress ldconfig warnings
RUN cd /usr/local/lib && \ RUN cd /usr/local/lib && \
......
...@@ -7,16 +7,28 @@ ...@@ -7,16 +7,28 @@
##### Wheel Build Image ########## ##### Wheel Build Image ##########
################################## ##################################
{% if platform == "multi" and device == "cuda" %}
# Multi-arch: declare both manylinux base images with explicit --platform so each is
# always pulled as the correct native arch regardless of the current TARGETPLATFORM.
# BuildKit only fetches and builds the stage that TARGETARCH resolves to; the other
# is a no-op for each sub-build.
FROM --platform=linux/amd64 quay.io/pypa/manylinux_2_28_x86_64 AS manylinux_amd64
FROM --platform=linux/arm64 quay.io/pypa/manylinux_2_28_aarch64 AS manylinux_arm64
{% endif %}
################################## ##################################
##### wheel_builder_base ######### ##### wheel_builder_base #########
################################## ##################################
# Shared base for all wheel builds: tools, system deps, and native libraries (except nixl). # Shared base for all wheel builds: tools, system deps, and native libraries (except nixl).
{% if platform == "multi" and device == "cuda" %}
FROM manylinux_${TARGETARCH} AS wheel_builder_base
{% else %}
FROM ${WHEEL_BUILDER_IMAGE} AS wheel_builder_base FROM ${WHEEL_BUILDER_IMAGE} AS wheel_builder_base
{% endif %}
# Redeclare ARGs for this stage # Redeclare ARGs for this stage
ARG ARCH ARG TARGETARCH
ARG ARCH_ALT
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
ARG DEVICE ARG DEVICE
...@@ -35,6 +47,8 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \ ...@@ -35,6 +47,8 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
CARGO_TARGET_DIR=/opt/dynamo/target \ CARGO_TARGET_DIR=/opt/dynamo/target \
PATH=/usr/local/cargo/bin:$PATH PATH=/usr/local/cargo/bin:$PATH
# Copy artifacts from base stage # Copy artifacts from base stage
COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME
...@@ -158,6 +172,7 @@ ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \ ...@@ -158,6 +172,7 @@ ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \
# Ensure a modern protoc is available (required for --experimental_allow_proto3_optional) # Ensure a modern protoc is available (required for --experimental_allow_proto3_optional)
RUN set -eux; \ RUN set -eux; \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
PROTOC_VERSION=25.3; \ PROTOC_VERSION=25.3; \
case "${ARCH_ALT}" in \ case "${ARCH_ALT}" in \
x86_64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-x86_64.zip" ;; \ x86_64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-x86_64.zip" ;; \
...@@ -200,7 +215,8 @@ ARG NIXL_UCX_REF ...@@ -200,7 +215,8 @@ ARG NIXL_UCX_REF
ARG NIXL_GDRCOPY_REF ARG NIXL_GDRCOPY_REF
# Build and install gdrcopy # Build and install gdrcopy
RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/gdrcopy.git && \ RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/gdrcopy.git && \
cd gdrcopy/packages && \ cd gdrcopy/packages && \
CUDA=/usr/local/cuda ./build-rpm-packages.sh && \ CUDA=/usr/local/cuda ./build-rpm-packages.sh && \
rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \ rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \
...@@ -233,7 +249,7 @@ ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \ ...@@ -233,7 +249,7 @@ ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
ARG FFMPEG_VERSION ARG FFMPEG_VERSION
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \ export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \ eval $(/tmp/use-sccache.sh setup-env); \
fi && \ fi && \
...@@ -273,7 +289,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -273,7 +289,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
# Build and install UCX # Build and install UCX
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \ export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \ eval $(/tmp/use-sccache.sh setup-env); \
fi && \ fi && \
...@@ -327,7 +343,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -327,7 +343,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
ARG NIXL_LIBFABRIC_REF ARG NIXL_LIBFABRIC_REF
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \ export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \ eval $(/tmp/use-sccache.sh setup-env); \
fi && \ fi && \
...@@ -359,7 +375,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -359,7 +375,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
ARG AWS_SDK_CPP_VERSION=1.11.760 ARG AWS_SDK_CPP_VERSION=1.11.760
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \ export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env cmake); \ eval $(/tmp/use-sccache.sh setup-env cmake); \
fi && \ fi && \
...@@ -397,7 +413,6 @@ COPY lib/ /opt/dynamo/lib/ ...@@ -397,7 +413,6 @@ COPY lib/ /opt/dynamo/lib/
COPY components/ /opt/dynamo/components/ COPY components/ /opt/dynamo/components/
# Build ai-dynamo (pure Python) and ai-dynamo-runtime (maturin) wheels # Build ai-dynamo (pure Python) and ai-dynamo-runtime (maturin) wheels
ARG ARCH
ARG USE_SCCACHE ARG USE_SCCACHE
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
...@@ -406,7 +421,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -406,7 +421,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=cache,target=/root/.cargo/git \ --mount=type=cache,target=/root/.cargo/git \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
export UV_CACHE_DIR=/root/.cache/uv && \ export UV_CACHE_DIR=/root/.cache/uv && \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \ export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env cmake); \ eval $(/tmp/use-sccache.sh setup-env cmake); \
fi && \ fi && \
...@@ -460,8 +475,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -460,8 +475,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
FROM wheel_builder_base AS wheel_builder FROM wheel_builder_base AS wheel_builder
# Build and install nixl # Build and install nixl
ARG ARCH ARG TARGETARCH
ARG ARCH_ALT
ARG DEVICE ARG DEVICE
ARG NIXL_REF ARG NIXL_REF
ARG USE_SCCACHE ARG USE_SCCACHE
...@@ -471,7 +485,7 @@ ARG CUDA_MAJOR ...@@ -471,7 +485,7 @@ ARG CUDA_MAJOR
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \ export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \ eval $(/tmp/use-sccache.sh setup-env); \
fi && \ fi && \
...@@ -502,11 +516,12 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -502,11 +516,12 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
/tmp/use-sccache.sh show-stats "NIXL" /tmp/use-sccache.sh show-stats "NIXL"
{% if device == "xpu" %} {% if device == "xpu" %}
ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu \ {# XPU only supports x86_64; no ARCH_ALT ARG needed #}
NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/plugins \ ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/x86_64-linux-gnu \
NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/x86_64-linux-gnu/plugins \
NIXL_PREFIX=/opt/intel/intel_nixl NIXL_PREFIX=/opt/intel/intel_nixl
{% else %} {% else %}
ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \ ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \ NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
NIXL_PREFIX=/opt/nvidia/nvda_nixl NIXL_PREFIX=/opt/nvidia/nvda_nixl
{% endif %} {% endif %}
...@@ -523,7 +538,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -523,7 +538,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
export UV_CACHE_DIR=/root/.cache/uv && \ export UV_CACHE_DIR=/root/.cache/uv && \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \ export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env); \ eval $(/tmp/use-sccache.sh setup-env); \
fi && \ fi && \
...@@ -545,7 +560,8 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -545,7 +560,8 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=cache,target=/root/.cargo/git \ --mount=type=cache,target=/root/.cargo/git \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
export UV_CACHE_DIR=/root/.cache/uv && \ export UV_CACHE_DIR=/root/.cache/uv && \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \ export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
if [ "$USE_SCCACHE" = "true" ]; then \ if [ "$USE_SCCACHE" = "true" ]; then \
eval $(/tmp/use-sccache.sh setup-env cmake); \ eval $(/tmp/use-sccache.sh setup-env cmake); \
fi && \ fi && \
......
...@@ -15,7 +15,7 @@ usage() { ...@@ -15,7 +15,7 @@ usage() {
Usage: $0 [COMMAND] [OPTIONS] Usage: $0 [COMMAND] [OPTIONS]
Commands: Commands:
install Install sccache binary (requires ARCH_ALT environment variable) install Install sccache binary (architecture auto-detected via uname -m)
setup-env Print export statements to configure sccache for compilation setup-env Print export statements to configure sccache for compilation
show-stats Display sccache statistics with optional build name show-stats Display sccache statistics with optional build name
help Show this help message help Show this help message
...@@ -33,10 +33,9 @@ Environment variables: ...@@ -33,10 +33,9 @@ Environment variables:
SCCACHE_BUCKET S3 bucket name (fallback if not passed as parameter) SCCACHE_BUCKET S3 bucket name (fallback if not passed as parameter)
SCCACHE_REGION S3 region (fallback if not passed as parameter) SCCACHE_REGION S3 region (fallback if not passed as parameter)
ARCH Architecture for S3 key prefix (fallback if not passed as parameter) ARCH Architecture for S3 key prefix (fallback if not passed as parameter)
ARCH_ALT Alternative architecture name for downloads (e.g., x86_64, aarch64)
Examples: Examples:
ARCH_ALT=x86_64 $0 install $0 install # architecture auto-detected via uname -m
eval \$($0 setup-env) # autotools / Meson eval \$($0 setup-env) # autotools / Meson
eval \$($0 setup-env cmake) # CMake builds eval \$($0 setup-env cmake) # CMake builds
$0 show-stats "UCX" $0 show-stats "UCX"
...@@ -44,18 +43,21 @@ EOF ...@@ -44,18 +43,21 @@ EOF
} }
install_sccache() { install_sccache() {
# Derive arch from TARGETARCH (set by BuildKit) with uname -m fallback
local arch_alt
if [ -n "${TARGETARCH:-}" ]; then
arch_alt=$([ "$TARGETARCH" = "amd64" ] && echo "x86_64" || echo "aarch64")
else
arch_alt=$(uname -m)
fi
if command -v sccache >/dev/null 2>&1; then if command -v sccache >/dev/null 2>&1; then
echo "sccache already installed at $(command -v sccache), skipping download" echo "sccache already installed at $(command -v sccache), skipping download"
else else
if [ -z "${ARCH_ALT:-}" ]; then echo "Installing sccache ${SCCACHE_VERSION} for architecture ${arch_alt}..."
echo "Error: ARCH_ALT environment variable is required for sccache installation"
exit 1
fi
echo "Installing sccache ${SCCACHE_VERSION} for architecture ${ARCH_ALT}..."
wget --tries=3 --waitretry=5 \ wget --tries=3 --waitretry=5 \
"https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${arch_alt}-unknown-linux-musl.tar.gz"
tar -xzf "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" tar -xzf "sccache-${SCCACHE_VERSION}-${arch_alt}-unknown-linux-musl.tar.gz"
mv "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl/sccache" /usr/local/bin/ mv "sccache-${SCCACHE_VERSION}-${arch_alt}-unknown-linux-musl/sccache" /usr/local/bin/
rm -rf sccache* rm -rf sccache*
fi fi
...@@ -174,4 +176,4 @@ main() { ...@@ -174,4 +176,4 @@ main() {
esac esac
} }
main "$@" main "$@"
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment