chore: Modify Docker templates to support multi arch builds (#6936)

da354663 · Ran Rubin · GitHub · 545fa300 · da354663 · da354663
Unverified Commit da354663 authored Mar 12, 2026 by Ran Rubin Committed by GitHub Mar 12, 2026
12 changed files
--- a/container/render.py
+++ b/container/render.py
@@ -9,6 +9,31 @@ from pathlib import Path
 import yaml
 from jinja2 import Environment, FileSystemLoader, StrictUndefined

+_VALID_ARCHS = {"amd64", "arm64"}
+
+
+def parse_platform(platform_str: str) -> str:
+    """Normalize a --platform value to the template variable used by Jinja2.
+
+    Accepts Docker-style values (linux/amd64, linux/arm64) or short form (amd64,
+    arm64), and comma-separated lists for multi-arch (linux/amd64,linux/arm64).
+    Only the text after the last '/' of each entry is validated.
+
+    Returns 'amd64' or 'arm64' for a single entry; 'multi' for any list of 2+
+    entries (duplicates included). Raises ValueError on an unrecognized arch.
+    """
+    parts = [p.strip() for p in platform_str.split(",")]
+    archs = [p.split("/")[-1] for p in parts]
+    for arch in archs:
+        if arch not in _VALID_ARCHS:
+            raise ValueError(
+                f"Unrecognized architecture '{arch}' in --platform '{platform_str}'. "
+                f"Valid architectures: {', '.join(sorted(_VALID_ARCHS))}"
+            )
+    if len(archs) > 1:
+        return "multi"
+    return archs[0]
+

 def parse_args():
    parser = argparse.ArgumentParser(
@@ -39,8 +64,15 @@ def parse_args():
    parser.add_argument(
        "--platform",
        type=str,
-        default="amd64",
-        help="Dockerfile platform to use. [amd64, arm64]",
+        default="linux/amd64",
+        help=(
+            "Target platform(s), Docker-style. Examples:\n"
+            "  linux/amd64            single-arch amd64 build\n"
+            "  linux/arm64            single-arch arm64 build\n"
+            "  linux/amd64,linux/arm64  multi-arch build; the rendered Dockerfile uses\n"
+            "                         Docker BuildX TARGETARCH directly (set per platform\n"
+            "                         by: docker buildx build --platform linux/amd64,linux/arm64)"
+        ),
    )
    parser.add_argument(
        "--cuda-version",
@@ -145,7 +177,7 @@ def render(args, context, script_dir):
        framework=args.framework,
        device=args.device,
        target=args.target,
-        platform=args.platform,
+        platform=args.platform,  # normalized: 'amd64', 'arm64', or 'multi'
        cuda_version=args.cuda_version,
        make_efa=args.make_efa,
    )
@@ -174,6 +206,9 @@ def render(args, context, script_dir):

 def main():
    args = parse_args()
+    # Normalize platform to template variable ('amd64', 'arm64', or 'multi')
+    # and store it back so render() and validate_args() both see the normalized form.
+    args.platform = parse_platform(args.platform)
    validate_args(args)
    # Clear cuda version for non-cuda device
    if args.device != "cuda":

--- a/container/templates/args.Dockerfile
+++ b/container/templates/args.Dockerfile
@@ -6,18 +6,14 @@
 ##########################
 #### Build Arguments #####
 ##########################
-# Define general architecture ARGs for supporting both x86 and aarch64 builds.
-#   ARCH: Used for package suffixes (e.g., amd64, arm64)
-#   ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
+# TARGETARCH is set automatically by Docker BuildKit for every --platform build.
+# It must NOT be declared in the global scope (before any FROM) — doing so shadows
+# the automatic per-platform value that BuildKit injects.
 #
-# Default values are for x86/amd64:
-#   --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64
+# In each stage that needs it, re-declare with:  ARG TARGETARCH
 #
-# For arm64/aarch64, build with:
-#   --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64
-#TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg
-ARG ARCH={{ platform }}
-ARG ARCH_ALT={{ "x86_64" if platform == "amd64" else "aarch64" }}
+# ARCH_ALT (x86_64 / aarch64) is computed inline in RUN steps:
+#   ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64")
 ARG DEVICE={{ device }}
 {% if device == "cuda" -%}
 {% set device_key = device + cuda_version -%}
@@ -43,8 +39,11 @@ ARG RUNTIME_IMAGE_TAG={{ context[framework][device_key].runtime_image_tag }}
 # wheel builder image selection
 {% if device == "xpu" %}
 ARG WHEEL_BUILDER_IMAGE=${BASE_IMAGE}:${BASE_IMAGE_TAG}
+{% elif platform == "multi" %}
+{# Multi-arch: manylinux selection is handled via --platform-pinned stage aliases   #}
+{# in wheel_builder.Dockerfile using TARGETARCH. No static ARG needed here.         #}
 {% else %}
-ARG WHEEL_BUILDER_IMAGE=quay.io/pypa/manylinux_2_28_${ARCH_ALT}
+ARG WHEEL_BUILDER_IMAGE=quay.io/pypa/manylinux_2_28_{{ "x86_64" if platform == "amd64" else "aarch64" }}
 {% endif %}

 # Build configuration

--- a/container/templates/dev.Dockerfile
+++ b/container/templates/dev.Dockerfile
@@ -13,8 +13,7 @@
 #   pull those binaries/configs in via COPY.
 FROM runtime AS dynamo_tools

-ARG ARCH
-ARG ARCH_ALT
+ARG TARGETARCH
 ARG DEVICE

 ENV DEBIAN_FRONTEND=noninteractive
@@ -131,9 +130,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
 # Estimated layer size: ~500MB–1.5GB (nsight-systems is a full profiling suite)
 # Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    wget -qO - "https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${ARCH}/nvidia.pub" \
+    wget -qO - "https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${TARGETARCH}/nvidia.pub" \
        | gpg --dearmor -o /etc/apt/keyrings/nvidia-devtools.gpg && \
-    echo "deb [signed-by=/etc/apt/keyrings/nvidia-devtools.gpg] https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${ARCH} /" \
+    echo "deb [signed-by=/etc/apt/keyrings/nvidia-devtools.gpg] https://developer.download.nvidia.com/devtools/repos/ubuntu2404/${TARGETARCH} /" \
        | tee /etc/apt/sources.list.d/nvidia-devtools.list && \
    apt-get update && \
    apt-get install -y --no-install-recommends nsight-systems-2025.5.1 && \
@@ -193,7 +192,7 @@ RUN if [ ! -e /usr/bin/python3 ]; then \
 # wheels, but dev stage needs it for maturin develop and cargo build from source.
 # - SGLang: Copy NIXL/UCX/libfabric/gdrcopy binaries from wheel_builder (not in upstream lmsysorg/sglang runtime).
 # - vllm/trtllm/none: NIXL/UCX are already present in runtime (no-op).
-ARG ARCH_ALT
+ARG TARGETARCH
 RUN --mount=from=wheel_builder,target=/wheel_builder \
    if [ "${FRAMEWORK}" = "sglang" ]; then \
        if [ -d /wheel_builder/usr/local/ucx ] && [ -d /wheel_builder/opt/nvidia/nvda_nixl ]; then \
@@ -204,20 +203,16 @@ RUN --mount=from=wheel_builder,target=/wheel_builder \
            cp /wheel_builder/usr/include/gdrapi.h /usr/include/; \
            cp /wheel_builder/usr/lib64/libgdrapi.so* /usr/lib64/; \
            echo "/usr/lib64" >> /etc/ld.so.conf.d/gdrcopy.conf; \
-            # SGLang expects ARCH-qualified lib paths; mirror lib64 into lib/${ARCH_ALT}-linux-gnu for parity.
-            if [ -d /opt/nvidia/nvda_nixl/lib64 ]; then \
-                mkdir -p /opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu; \
-                cp -r /opt/nvidia/nvda_nixl/lib64/. /opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/; \
-            fi; \
        fi; \
    fi

-# All frameworks use the same path pattern: /opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu
-# For vllm/trtllm/none: This resets the same values already set in runtime (no harm)
-# For sglang: This sets them for the first time (required)
+# NIXL is installed under lib64 (manylinux/AlmaLinux convention used by the wheel_builder).
+# All frameworks reference NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64.
+# For vllm/trtllm/none: This resets the same values already set in runtime (no harm).
+# For sglang: This sets them for the first time (required).
 ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \
-    NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \
-    NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins
+    NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
+    NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins

 # Set universal CUDA development environment variables (all frameworks)
 # vLLM: Dockerfile.vllm line 533, 597

--- a/container/templates/dynamo_base.Dockerfile
+++ b/container/templates/dynamo_base.Dockerfile
@@ -9,8 +9,7 @@

 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dynamo_base

-ARG ARCH
-ARG ARCH_ALT
+ARG TARGETARCH

 USER root
 WORKDIR /opt/dynamo
@@ -18,7 +17,8 @@ WORKDIR /opt/dynamo
 # Install sccache into the base image so downstream stages can COPY it
 # instead of downloading from GitHub (avoids 502 errors under parallel builds)
 ARG SCCACHE_VERSION=v0.14.0
-RUN wget --tries=3 --waitretry=5 \
+RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
+    wget --tries=3 --waitretry=5 \
        "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \
    tar -xzf "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \
    mv "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl/sccache" /usr/local/bin/ && \
@@ -31,29 +31,27 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 # Install NATS server
 ARG NATS_VERSION
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/${NATS_VERSION}/nats-server-${NATS_VERSION}-${ARCH}.deb && \
-    dpkg -i nats-server-${NATS_VERSION}-${ARCH}.deb && rm nats-server-${NATS_VERSION}-${ARCH}.deb
+    wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/${NATS_VERSION}/nats-server-${NATS_VERSION}-${TARGETARCH}.deb && \
+    dpkg -i nats-server-${NATS_VERSION}-${TARGETARCH}.deb && rm nats-server-${NATS_VERSION}-${TARGETARCH}.deb

 # Install etcd
 ARG ETCD_VERSION
-RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \
+RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${TARGETARCH}.tar.gz -O /tmp/etcd.tar.gz && \
    mkdir -p /usr/local/bin/etcd && \
    tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \
    rm /tmp/etcd.tar.gz
 ENV PATH=/usr/local/bin/etcd/:$PATH

 # Rust Setup
-# Rust environment setup
 ENV RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    PATH=/usr/local/cargo/bin:$PATH \
    RUST_VERSION=1.93.1

-# Define Rust target based on ARCH_ALT ARG
-ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu
-
-# Install Rust
-RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \
+# Install Rust — ARCH_ALT (x86_64/aarch64) is derived from TARGETARCH at build time
+RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
+    RUSTARCH="${ARCH_ALT}-unknown-linux-gnu" && \
+    wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \
    rm rustup-init && \

--- a/container/templates/dynamo_runtime.Dockerfile
+++ b/container/templates/dynamo_runtime.Dockerfile
@@ -9,7 +9,6 @@

 FROM dynamo_base AS runtime

-ARG ARCH_ALT
 ARG PYTHON_VERSION

 # Create dynamo user with group 0 for OpenShift compatibility
@@ -26,8 +25,8 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \

 # NIXL environment variables
 ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \
-    NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \
-    NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins \
+    NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
+    NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
    CARGO_TARGET_DIR=/opt/dynamo/target

 ENV LD_LIBRARY_PATH=\
@@ -40,7 +39,6 @@ ${LD_LIBRARY_PATH}
 # Copy ucx and nixl libs
 COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx/ /usr/local/ucx/
 COPY --chown=dynamo: --from=wheel_builder ${NIXL_PREFIX}/ ${NIXL_PREFIX}/
-COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
 COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
 COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/


--- a/container/templates/sglang_runtime.Dockerfile
+++ b/container/templates/sglang_runtime.Dockerfile
@@ -18,7 +18,7 @@ RUN apt remove -y python3-apt python3-blinker && \

 # This ARG is still utilized for SGLANG Version extraction
 ARG RUNTIME_IMAGE_TAG
-ARG ARCH_ALT
+ARG TARGETARCH
 WORKDIR /workspace

 # Install NATS and ETCD
@@ -68,13 +68,12 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src

 # NIXL environment and native libraries
 ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
-ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
+ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
 ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins

 # Copy UCX and NIXL native libraries to system directories
 COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
 COPY --chown=dynamo:0 --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
-COPY --chown=dynamo:0 --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/

 ENV PATH=/usr/local/ucx/bin:$PATH


--- a/container/templates/trtllm_framework.Dockerfile
+++ b/container/templates/trtllm_framework.Dockerfile
@@ -29,7 +29,7 @@ FROM ${TRTLLM_WHEEL_IMAGE} AS trtllm_wheel_image

 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS framework

-ARG ARCH_ALT
+ARG TARGETARCH
 COPY --from=dynamo_base /bin/uv /bin/uvx /bin/

 # Install minimal dependencies needed for TensorRT-LLM installation
@@ -149,6 +149,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \
            TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \
            PYTHON_TAG="cp$(echo ${PYTHON_VERSION} | tr -d '.')"; \
+            ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
            DIRECT_URL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-${TRTLLM_VERSION}-${PYTHON_TAG}-${PYTHON_TAG}-linux_${ARCH_ALT}.whl"; \
            uv pip install --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${DIRECT_URL}" triton==3.5.1; \
        else \

--- a/container/templates/trtllm_runtime.Dockerfile
+++ b/container/templates/trtllm_runtime.Dockerfile
@@ -24,7 +24,7 @@

 FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime

-ARG ARCH_ALT
+ARG TARGETARCH
 WORKDIR /workspace
 ENV ENV=${ENV:-/etc/shinit_v2}
 ENV VIRTUAL_ENV=/opt/dynamo/venv
@@ -56,8 +56,11 @@ ENV CUDA_HOME=/usr/local/cuda \

 # Copy OpenMPI from PyTorch base image
 COPY --from=pytorch_base /opt/hpcx/ompi /opt/hpcx/ompi
-# Copy NUMA library from PyTorch base image
-COPY --from=pytorch_base /usr/lib/${ARCH_ALT}-linux-gnu/libnuma.so* /usr/lib/${ARCH_ALT}-linux-gnu/
+# Copy NUMA library from PyTorch base image (arch-dependent path)
+RUN --mount=type=bind,from=pytorch_base,source=/usr/lib,target=/mnt/usr_lib \
+    ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
+    mkdir -p /usr/lib/${ARCH_ALT}-linux-gnu && \
+    cp /mnt/usr_lib/${ARCH_ALT}-linux-gnu/libnuma.so* /usr/lib/${ARCH_ALT}-linux-gnu/

 # Copy UCX libraries, libucc.so is needed by pytorch. May not need to copy whole hpcx dir but only /opt/hpcx/ucc/
 COPY --from=pytorch_base /opt/hpcx /opt/hpcx
@@ -93,6 +96,7 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
 # Cache apt downloads; sharing=locked avoids apt/dpkg races with concurrent builds.
 ARG PYTHON_VERSION
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
    if [ ${ARCH_ALT} = "x86_64" ]; then \
        ARCH_FOR_GPG=${ARCH_ALT}; \
    else \
@@ -160,6 +164,19 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca
    cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
 {% endif %}

+# Copy TensorRT and libgomp from framework image (arch-dependent path, needs root)
+COPY --from=framework /usr/local/tensorrt /usr/local/tensorrt
+RUN --mount=type=bind,from=framework,source=/usr/lib,target=/mnt/usr_lib \
+    ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
+    cp /mnt/usr_lib/${ARCH_ALT}-linux-gnu/libgomp.so* /usr/lib/${ARCH_ALT}-linux-gnu/
+
+# Register arch-dependent TensorRT and nvshmem library paths with ldconfig so the
+# dynamic linker finds them in every execution context (docker run, exec, k8s, etc.)
+RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
+    echo "/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib" > /etc/ld.so.conf.d/tensorrt.conf && \
+    echo "/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13" >> /etc/ld.so.conf.d/tensorrt.conf && \
+    ldconfig
+
 # Switch to dynamo user
 USER dynamo
 ENV HOME=/home/dynamo
@@ -168,13 +185,9 @@ SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]

 ENV DYNAMO_HOME=/workspace
 ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
-ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
+ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
 ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins

-# Copy libgomp.so from framework image
-COPY --from=framework /usr/local/tensorrt /usr/local/tensorrt
-COPY --from=framework /usr/lib/${ARCH_ALT}-linux-gnu/libgomp.so* /usr/lib/${ARCH_ALT}-linux-gnu/
-
 # Copy pre-built venv with PyTorch and TensorRT-LLM from framework stage
 # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
 COPY --chmod=775 --chown=dynamo:0 --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
@@ -184,20 +197,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
 # Copy dynamo wheels for gitlab artifacts (read-only, no group-write needed)
 COPY --chown=dynamo: --from=wheel_builder /usr/local/ucx /usr/local/ucx
 COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
-COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
 COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
 COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/

-ENV TENSORRT_LIB_DIR=/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib
 ENV PATH="/usr/local/ucx/bin:${VIRTUAL_ENV}/bin:/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin:$PATH"
+# Both arch paths are listed; the dynamic loader silently skips whichever one does not exist.
 ENV LD_LIBRARY_PATH=\
 $NIXL_LIB_DIR:\
 $NIXL_PLUGIN_DIR:\
 /usr/local/ucx/lib:\
 /usr/local/ucx/lib/ucx:\
 /opt/hpcx/ompi/lib:\
-/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13/:\
-$TENSORRT_LIB_DIR:\
+/usr/local/tensorrt/targets/x86_64-linux-gnu/lib:\
+/usr/local/tensorrt/targets/aarch64-linux-gnu/lib:\
+/usr/lib/x86_64-linux-gnu/nvshmem/13/:\
+/usr/lib/aarch64-linux-gnu/nvshmem/13/:\
 /opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch/lib:\
 /opt/dynamo/venv/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt/lib:\
 /usr/local/cuda/lib:\

--- a/container/templates/vllm_framework.Dockerfile
+++ b/container/templates/vllm_framework.Dockerfile
@@ -61,7 +61,7 @@ RUN mkdir -p /opt/dynamo/venv && \
 ENV VIRTUAL_ENV=/opt/dynamo/venv \
    PATH="/opt/dynamo/venv/bin:${PATH}"

-ARG ARCH
+ARG TARGETARCH
 # Install vllm - keep this early in Dockerfile to avoid
 # rebuilds from unrelated source code changes
 ARG VLLM_REF
@@ -98,7 +98,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
        --device $DEVICE \
        --vllm-ref $VLLM_REF \
        --max-jobs $MAX_JOBS \
-        --arch $ARCH \
+        --arch $TARGETARCH \
        --installation-dir /opt \
        ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
        ${VLLM_OMNI_REF:+--vllm-omni-ref "$VLLM_OMNI_REF"} \

--- a/container/templates/vllm_runtime.Dockerfile
+++ b/container/templates/vllm_runtime.Dockerfile
@@ -89,7 +89,6 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
    # NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
    && mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh

-ARG ARCH_ALT
 ARG PYTHON_VERSION
 ENV PYTHON_VERSION=${PYTHON_VERSION}

@@ -171,11 +170,11 @@ SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]

 {% if device == "xpu" %}
 ENV NIXL_PREFIX=/opt/intel/intel_nixl
-ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
+ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/x86_64-linux-gnu
 ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
 {% else %}
 ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
-ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
+ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib64
 ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
 {% endif %}

@@ -222,20 +221,13 @@ COPY --chown=dynamo:0 --from=framework /opt/vllm /opt/vllm
 COPY --from=wheel_builder /usr/local/ucx /usr/local/ucx
 COPY --chown=dynamo: --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
 {% if device == "xpu" %}
-COPY --chown=dynamo: --from=wheel_builder /opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/. ${NIXL_LIB_DIR}/
-{% else %}
-COPY --chown=dynamo: --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
+{# XPU NIXL uses lib/x86_64-linux-gnu; copy to NIXL_LIB_DIR to ensure lib dir is populated #}
+COPY --chown=dynamo: --from=wheel_builder /opt/intel/intel_nixl/lib/x86_64-linux-gnu/. ${NIXL_LIB_DIR}/
 {% endif %}
+{# For cuda: NIXL_LIB_DIR = lib64, already included in the $NIXL_PREFIX COPY above #}
 COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
 COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/

-{% if device == "cuda" %}
-# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
-COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
-COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
-COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
-COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/
-{% endif %}

 ENV PATH=/usr/local/ucx/bin:$PATH

@@ -347,6 +339,12 @@ RUN chmod g+w /workspace /workspace/* /opt/dynamo /opt/dynamo/* ${VIRTUAL_ENV} &
    echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc

 {% if device == "cuda" %}
+# Copy AWS SDK C++ libraries (required for NIXL OBJ backend / S3 support)
+COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libaws* /usr/local/lib/
+COPY --chown=dynamo: --from=wheel_builder /usr/local/lib64/libs2n* /usr/local/lib/
+COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libcrypto.so.1.1* /usr/local/lib/
+COPY --chown=dynamo: --from=wheel_builder /usr/lib64/libssl.so.1.1* /usr/local/lib/
+
 # Fix library symlinks that Docker COPY dereferenced (COPY always follows symlinks)
 # This recreates proper symlinks to save space and suppress ldconfig warnings
 RUN cd /usr/local/lib && \

--- a/container/templates/wheel_builder.Dockerfile
+++ b/container/templates/wheel_builder.Dockerfile
@@ -7,16 +7,28 @@
 ##### Wheel Build Image ##########
 ##################################

+{% if platform == "multi" and device == "cuda" %}
+# Multi-arch: declare both manylinux base images with explicit --platform so each is
+# always pulled as the correct native arch regardless of the current TARGETPLATFORM.
+# BuildKit only fetches and builds the stage that TARGETARCH resolves to; the other
+# is a no-op for each sub-build.
+FROM --platform=linux/amd64 quay.io/pypa/manylinux_2_28_x86_64 AS manylinux_amd64
+FROM --platform=linux/arm64 quay.io/pypa/manylinux_2_28_aarch64 AS manylinux_arm64
+{% endif %}
+
 ##################################
 ##### wheel_builder_base #########
 ##################################
 # Shared base for all wheel builds: tools, system deps, and native libraries (except nixl).

+{% if platform == "multi" and device == "cuda" %}
+FROM manylinux_${TARGETARCH} AS wheel_builder_base
+{% else %}
 FROM ${WHEEL_BUILDER_IMAGE} AS wheel_builder_base
+{% endif %}

 # Redeclare ARGs for this stage
-ARG ARCH
-ARG ARCH_ALT
+ARG TARGETARCH
 ARG CARGO_BUILD_JOBS
 ARG DEVICE

@@ -35,6 +47,8 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
    CARGO_TARGET_DIR=/opt/dynamo/target \
    PATH=/usr/local/cargo/bin:$PATH

+
+
 # Copy artifacts from base stage
 COPY --from=dynamo_base $RUSTUP_HOME $RUSTUP_HOME
 COPY --from=dynamo_base $CARGO_HOME $CARGO_HOME
@@ -158,6 +172,7 @@ ENV PATH="/opt/rh/gcc-toolset-14/root/usr/bin:${PATH}" \

 # Ensure a modern protoc is available (required for --experimental_allow_proto3_optional)
 RUN set -eux; \
+    ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64"); \
    PROTOC_VERSION=25.3; \
    case "${ARCH_ALT}" in \
      x86_64) PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-x86_64.zip" ;; \
@@ -200,7 +215,8 @@ ARG NIXL_UCX_REF
 ARG NIXL_GDRCOPY_REF

 # Build and install gdrcopy
-RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/gdrcopy.git && \
+RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
+    git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/gdrcopy.git && \
    cd gdrcopy/packages && \
    CUDA=/usr/local/cuda ./build-rpm-packages.sh && \
    rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \
@@ -233,7 +249,7 @@ ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
 ARG FFMPEG_VERSION
 RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
-    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
+    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env); \
    fi && \
@@ -273,7 +289,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
 # Build and install UCX
 RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
-    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
+    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env); \
    fi && \
@@ -327,7 +343,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
 ARG NIXL_LIBFABRIC_REF
 RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
-    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
+    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env); \
    fi && \
@@ -359,7 +375,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
 ARG AWS_SDK_CPP_VERSION=1.11.760
 RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
-    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
+    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env cmake); \
    fi && \
@@ -397,7 +413,6 @@ COPY lib/ /opt/dynamo/lib/
 COPY components/ /opt/dynamo/components/

 # Build ai-dynamo (pure Python) and ai-dynamo-runtime (maturin) wheels
-ARG ARCH
 ARG USE_SCCACHE
 ARG ENABLE_MEDIA_FFMPEG
 RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
@@ -406,7 +421,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=cache,target=/root/.cargo/git \
    --mount=type=cache,target=/root/.cache/uv \
    export UV_CACHE_DIR=/root/.cache/uv && \
-    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
+    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env cmake); \
    fi && \
@@ -460,8 +475,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 FROM wheel_builder_base AS wheel_builder

 # Build and install nixl
-ARG ARCH
-ARG ARCH_ALT
+ARG TARGETARCH
 ARG DEVICE
 ARG NIXL_REF
 ARG USE_SCCACHE
@@ -471,7 +485,7 @@ ARG CUDA_MAJOR

 RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
-    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
+    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env); \
    fi && \
@@ -502,8 +516,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    /tmp/use-sccache.sh show-stats "NIXL"

 {% if device == "xpu" %}
-ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu  \
-    NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/${ARCH_ALT}-linux-gnu/plugins \
+{# XPU only supports x86_64; no ARCH_ALT ARG needed #}
+ENV NIXL_LIB_DIR=/opt/intel/intel_nixl/lib/x86_64-linux-gnu \
+    NIXL_PLUGIN_DIR=/opt/intel/intel_nixl/lib/x86_64-linux-gnu/plugins \
    NIXL_PREFIX=/opt/intel/intel_nixl
 {% else %}
 ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
@@ -523,7 +538,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    --mount=type=cache,target=/root/.cache/uv \
    export UV_CACHE_DIR=/root/.cache/uv && \
-    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
+    export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}}" && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env); \
    fi && \
@@ -545,7 +560,8 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=cache,target=/root/.cargo/git \
    --mount=type=cache,target=/root/.cache/uv \
    export UV_CACHE_DIR=/root/.cache/uv && \
-    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
+    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${TARGETARCH}} && \
+    ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
    if [ "$USE_SCCACHE" = "true" ]; then \
        eval $(/tmp/use-sccache.sh setup-env cmake); \
    fi && \

--- a/container/use-sccache.sh
+++ b/container/use-sccache.sh
@@ -15,7 +15,7 @@ usage() {
 Usage: $0 [COMMAND] [OPTIONS]

 Commands:
-    install         Install sccache binary (requires ARCH_ALT environment variable)
+    install         Install sccache binary (architecture auto-detected via uname -m)
    setup-env       Print export statements to configure sccache for compilation
    show-stats      Display sccache statistics with optional build name
    help            Show this help message
@@ -33,10 +33,9 @@ Environment variables:
    SCCACHE_BUCKET          S3 bucket name (fallback if not passed as parameter)
    SCCACHE_REGION          S3 region (fallback if not passed as parameter)
    ARCH                    Architecture for S3 key prefix (fallback if not passed as parameter)
-    ARCH_ALT                Alternative architecture name for downloads (e.g., x86_64, aarch64)

 Examples:
-    ARCH_ALT=x86_64 $0 install
+    $0 install                     # architecture auto-detected via uname -m
    eval \$($0 setup-env)          # autotools / Meson
    eval \$($0 setup-env cmake)    # CMake builds
    $0 show-stats "UCX"
@@ -44,18 +43,21 @@ EOF
 }

 install_sccache() {
+    # Derive arch from TARGETARCH (set by BuildKit) with uname -m fallback
+    local arch_alt
+    if [ -n "${TARGETARCH:-}" ]; then
+        arch_alt=$([ "$TARGETARCH" = "amd64" ] && echo "x86_64" || echo "aarch64")
+    else
+        arch_alt=$(uname -m)
+    fi
    if command -v sccache >/dev/null 2>&1; then
        echo "sccache already installed at $(command -v sccache), skipping download"
    else
-        if [ -z "${ARCH_ALT:-}" ]; then
-            echo "Error: ARCH_ALT environment variable is required for sccache installation"
-            exit 1
-        fi
-        echo "Installing sccache ${SCCACHE_VERSION} for architecture ${ARCH_ALT}..."
+        echo "Installing sccache ${SCCACHE_VERSION} for architecture ${arch_alt}..."
        wget --tries=3 --waitretry=5 \
-            "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz"
-        tar -xzf "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz"
-        mv "sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl/sccache" /usr/local/bin/
+            "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${arch_alt}-unknown-linux-musl.tar.gz"
+        tar -xzf "sccache-${SCCACHE_VERSION}-${arch_alt}-unknown-linux-musl.tar.gz"
+        mv "sccache-${SCCACHE_VERSION}-${arch_alt}-unknown-linux-musl/sccache" /usr/local/bin/
        rm -rf sccache*
    fi