Unverified commit 73505c77, authored by Kapil Arya, committed by GitHub
Browse files

fix: correct Nixl plugin paths in Dockerfile. (#2048)

parent d65ce1b0
...@@ -38,8 +38,16 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base ...@@ -38,8 +38,16 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base
# Redeclare ARCH and ARCH_ALT so they're available in this stage # Redeclare ARCH and ARCH_ALT so they're available in this stage
ARG ARCH ARG ARCH
ARG ARCH_ALT ARG ARCH_ALT
ARG NIXL_UCX_REF=v1.19.x
ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4
ENV NIXL_SRC_DIR=/opt/nixl
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
ENV LD_LIBRARY_PATH=$NIXL_LIB_DIR:$NIXL_PLUGIN_DIR:$LD_LIBRARY_PATH
USER root USER root
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
...@@ -65,31 +73,35 @@ RUN apt-get update -y && \ ...@@ -65,31 +73,35 @@ RUN apt-get update -y && \
WORKDIR /workspace WORKDIR /workspace
### UCX EFA Setup ### ### UCX EFA Setup ###
RUN rm -rf /opt/hpcx/ucx RUN rm -rf /opt/hpcx/ucx && \
RUN rm -rf /usr/local/ucx rm -rf /usr/local/ucx && \
RUN cd /usr/local/src && \ echo "Building UCX with reference $NIXL_UCX_REF" && \
cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \ git clone https://github.com/openucx/ucx.git && \
cd ucx && \ cd ucx && \
git checkout v1.19.x && \ git checkout $NIXL_UCX_REF && \
./autogen.sh && ./configure \ ./autogen.sh && ./configure \
--prefix=/usr/local/ucx \ --prefix=/usr/local/ucx \
--enable-shared \ --enable-shared \
--disable-static \ --disable-static \
--disable-doxygen-doc \ --disable-doxygen-doc \
--enable-optimizations \ --enable-optimizations \
--enable-cma \ --enable-cma \
--enable-devel-headers \ --enable-devel-headers \
--with-cuda=/usr/local/cuda \ --with-cuda=/usr/local/cuda \
--with-verbs \ --with-verbs \
--with-efa \ --with-efa \
--with-dm \ --with-dm \
--with-gdrcopy=/usr/local \ --with-gdrcopy=/usr/local \
--enable-mt && \ --enable-mt && \
make -j && \ make -j && \
make -j install-strip && \ make -j install-strip && \
ldconfig ldconfig
ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH ENV LD_LIBRARY_PATH=\
/usr/lib:/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH
ENV CPATH=/usr/include:$CPATH ENV CPATH=/usr/include:$CPATH
ENV PATH=/usr/bin:$PATH ENV PATH=/usr/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH
...@@ -98,25 +110,21 @@ SHELL ["/bin/bash", "-c"] ...@@ -98,25 +110,21 @@ SHELL ["/bin/bash", "-c"]
WORKDIR /workspace WORKDIR /workspace
### NIXL SETUP ### ### NIXL SETUP ###
# Clone nixl source, and checkout the nixl ref # Clone nixl source
RUN git clone "https://github.com/ai-dynamo/nixl.git" /opt/nixl && \ # TEMP: disable gds backend for arm64
cd /opt/nixl && \ RUN git clone "https://github.com/ai-dynamo/nixl.git" ${NIXL_SRC_DIR} && \
git checkout ${NIXL_REF} cd ${NIXL_SRC_DIR} && \
RUN if [ "$ARCH" = "arm64" ]; then \ git checkout ${NIXL_REF} && \
cd /opt/nixl && \ if [ "$ARCH" = "arm64" ]; then \
mkdir build && \ nixl_build_args="-Ddisable_gds_backend=true -Dgds_path=/usr/local/cuda/targets/sbsa-linux"; \
meson setup build/ --prefix=/usr/local/nixl -Dgds_path=/usr/local/cuda/targets/sbsa-linux && \
cd build/ && \
ninja && \
ninja install; \
else \ else \
cd /opt/nixl && \ nixl_build_args=""; \
mkdir build && \ fi && \
meson setup build/ --prefix=/usr/local/nixl && \ mkdir build && \
cd build/ && \ meson setup build/ --buildtype=release --prefix=$NIXL_PREFIX $nixl_build_args && \
ninja && \ cd build/ && \
ninja install; \ ninja && \
fi ninja install;
### NATS & ETCD SETUP ### ### NATS & ETCD SETUP ###
# nats # nats
...@@ -143,11 +151,17 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv ...@@ -143,11 +151,17 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Install NIXL Python module # Install NIXL Python module
RUN cd /opt/nixl && uv build . --out-dir /workspace/wheels/nixl # TODO: Move gds_path selection based on arch into NIXL build
# TEMP: disable gds backend for arm64
# Install the wheel RUN if [ "$ARCH" = "arm64" ]; then \
# TODO: Move NIXL wheel install to the wheel_builder stage cd ${NIXL_SRC_DIR} && uv build . --out-dir /workspace/wheels/nixl \
RUN uv pip install /workspace/wheels/nixl/*.whl --config-settings=setup-args="-Ddisable_gds_backend=true -Dgds_path=/usr/local/cuda/targets/sbsa-linux"; \
else \
cd ${NIXL_SRC_DIR} && uv build . --out-dir /workspace/wheels/nixl; \
fi && \
# Install the wheel
# TODO: Move NIXL wheel install to the wheel_builder stage
uv pip install /workspace/wheels/nixl/*.whl
# Install sglang # Install sglang
#TODO: Built wheel should become an artifact which can be cached and reused in subsequent builds #TODO: Built wheel should become an artifact which can be cached and reused in subsequent builds
...@@ -276,6 +290,9 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} ...@@ -276,6 +290,9 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
# Use build arg RELEASE_BUILD = true to generate wheels for Python 3.10, 3.11 and 3.12. # Use build arg RELEASE_BUILD = true to generate wheels for Python 3.10, 3.11 and 3.12.
ARG RELEASE_BUILD ARG RELEASE_BUILD
# Keep in sync with the base image.
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
WORKDIR /workspace WORKDIR /workspace
RUN yum update -y \ RUN yum update -y \
...@@ -292,7 +309,7 @@ ENV RUSTUP_HOME=/usr/local/rustup \ ...@@ -292,7 +309,7 @@ ENV RUSTUP_HOME=/usr/local/rustup \
COPY --from=base $RUSTUP_HOME $RUSTUP_HOME COPY --from=base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=base $CARGO_HOME $CARGO_HOME COPY --from=base $CARGO_HOME $CARGO_HOME
COPY --from=base /usr/local/nixl /opt/nvidia/nvda_nixl COPY --from=base $NIXL_PREFIX $NIXL_PREFIX
COPY --from=base /workspace /workspace COPY --from=base /workspace /workspace
COPY --from=base $VIRTUAL_ENV $VIRTUAL_ENV COPY --from=base $VIRTUAL_ENV $VIRTUAL_ENV
ENV PATH=$CARGO_HOME/bin:$VIRTUAL_ENV/bin:$PATH ENV PATH=$CARGO_HOME/bin:$VIRTUAL_ENV/bin:$PATH
...@@ -339,7 +356,8 @@ ENV CARGO_TARGET_DIR=/workspace/target ...@@ -339,7 +356,8 @@ ENV CARGO_TARGET_DIR=/workspace/target
WORKDIR /workspace WORKDIR /workspace
COPY --from=wheel_builder /workspace /workspace COPY --from=wheel_builder /workspace /workspace
COPY --from=wheel_builder /opt/nvidia/nvda_nixl /opt/nvidia/nvda_nixl COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
# Copy Cargo cache to avoid re-downloading dependencies # Copy Cargo cache to avoid re-downloading dependencies
COPY --from=wheel_builder $CARGO_HOME $CARGO_HOME COPY --from=wheel_builder $CARGO_HOME $CARGO_HOME
...@@ -369,7 +387,6 @@ RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/la ...@@ -369,7 +387,6 @@ RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/la
# Tell vllm to use the Dynamo LLM C API for KV Cache Routing # Tell vllm to use the Dynamo LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH=/opt/dynamo/bindings/lib/libdynamo_llm_capi.so ENV VLLM_KV_CAPI_PATH=/opt/dynamo/bindings/lib/libdynamo_llm_capi.so
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu/
ENV PYTHONPATH=/workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang/utils:$PYTHONPATH ENV PYTHONPATH=/workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang/utils:$PYTHONPATH
...@@ -400,12 +417,21 @@ COPY --from=base /usr/local/bin/etcd/ /usr/local/bin/etcd/ ...@@ -400,12 +417,21 @@ COPY --from=base /usr/local/bin/etcd/ /usr/local/bin/etcd/
ENV PATH=/usr/local/bin/etcd/:$PATH ENV PATH=/usr/local/bin/etcd/:$PATH
# Copy UCX from base image as plugin for NIXL # Copy UCX from base image as plugin for NIXL
# Copy NIXL source from base image (required for NIXL plugins) # Copy NIXL source from wheel_builder image
COPY --from=base /usr/local/ucx /usr/local/ucx
COPY --from=base /usr/local/nixl /usr/local/nixl
ARG ARCH_ALT ARG ARCH_ALT
ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu/plugins ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu:/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu/plugins:/usr/local/ucx/lib:$LD_LIBRARY_PATH ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
COPY --from=base /usr/local/ucx /usr/local/ucx
COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
ENV LD_LIBRARY_PATH=\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH
# Setup the python environment # Setup the python environment
# libnuma-dev is a required dependency for sglang integration with NIXL # libnuma-dev is a required dependency for sglang integration with NIXL
......
...@@ -43,8 +43,16 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS build ...@@ -43,8 +43,16 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS build
# Redeclare ARCH and ARCH_ALT so they're available in this build stage # Redeclare ARCH and ARCH_ALT so they're available in this build stage
ARG ARCH ARG ARCH
ARG ARCH_ALT ARG ARCH_ALT
ARG NIXL_UCX_REF=v1.19.x
ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4
ENV NIXL_SRC_DIR=/opt/nixl
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
ENV LD_LIBRARY_PATH=$NIXL_LIB_DIR:$NIXL_PLUGIN_DIR:$LD_LIBRARY_PATH
USER root USER root
# Install utilities # Install utilities
...@@ -61,58 +69,56 @@ RUN apt update -y && \ ...@@ -61,58 +69,56 @@ RUN apt update -y && \
ninja-build ninja-build
### UCX EFA Setup ### ### UCX EFA Setup ###
RUN rm -rf /opt/hpcx/ucx RUN rm -rf /opt/hpcx/ucx && \
RUN rm -rf /usr/local/ucx rm -rf /usr/local/ucx && \
RUN cd /usr/local/src && \ echo "Building UCX with reference $NIXL_UCX_REF" && \
cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \ git clone https://github.com/openucx/ucx.git && \
cd ucx && \ cd ucx && \
git checkout v1.19.x && \ git checkout $NIXL_UCX_REF && \
./autogen.sh && ./configure \ ./autogen.sh && ./configure \
--prefix=/usr/local/ucx \ --prefix=/usr/local/ucx \
--enable-shared \ --enable-shared \
--disable-static \ --disable-static \
--disable-doxygen-doc \ --disable-doxygen-doc \
--enable-optimizations \ --enable-optimizations \
--enable-cma \ --enable-cma \
--enable-devel-headers \ --enable-devel-headers \
--with-cuda=/usr/local/cuda \ --with-cuda=/usr/local/cuda \
--with-verbs \ --with-verbs \
--with-efa \ --with-efa \
--with-dm \ --with-dm \
--with-gdrcopy=/usr/local \ --with-gdrcopy=/usr/local \
--enable-mt && \ --enable-mt && \
make -j && \ make -j && \
make -j install-strip && \ make -j install-strip && \
ldconfig ldconfig
ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH ENV LD_LIBRARY_PATH=\
/usr/lib:/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH
ENV CPATH=/usr/include:$CPATH ENV CPATH=/usr/include:$CPATH
ENV PATH=/usr/bin:$PATH ENV PATH=/usr/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH
SHELL ["/bin/bash", "-c"] SHELL ["/bin/bash", "-c"]
# NIXL SETUP ### NIXL SETUP ###
# Clone nixl source, and checkout the nixl ref # Clone nixl source
RUN git clone "https://github.com/ai-dynamo/nixl.git" /opt/nixl && \ # TEMP: disable gds backend for arm64
cd /opt/nixl && \ RUN git clone "https://github.com/ai-dynamo/nixl.git" ${NIXL_SRC_DIR} && \
git checkout ${NIXL_REF} cd ${NIXL_SRC_DIR} && \
RUN if [ "$ARCH" = "arm64" ]; then \ git checkout ${NIXL_REF} && \
cd /opt/nixl && \ if [ "$ARCH" = "arm64" ]; then \
mkdir build && \ nixl_build_args="-Ddisable_gds_backend=true -Dgds_path=/usr/local/cuda/targets/sbsa-linux"; \
meson setup build/ --prefix=/usr/local/nixl -Dgds_path=/usr/local/cuda/targets/sbsa-linux && \
cd build/ && \
ninja && \
ninja install; \
else \ else \
cd /opt/nixl && \ nixl_build_args=""; \
mkdir build && \ fi && \
meson setup build/ --prefix=/usr/local/nixl && \ mkdir build && \
cd build/ && \ meson setup build/ --buildtype=release --prefix=$NIXL_PREFIX $nixl_build_args && \
ninja && \ cd build/ && \
ninja install; \ ninja && \
fi ninja install;
ENV NIXL_PREFIX=/usr/local/nixl
# nats # nats
RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \ RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \
...@@ -210,11 +216,17 @@ RUN mkdir /opt/dynamo && \ ...@@ -210,11 +216,17 @@ RUN mkdir /opt/dynamo && \
ENV VIRTUAL_ENV=/opt/dynamo/venv ENV VIRTUAL_ENV=/opt/dynamo/venv
# Install NIXL Python module # Install NIXL Python module
RUN cd /opt/nixl && uv build . --out-dir /workspace/wheels/nixl # TODO: Move gds_path selection based on arch into NIXL build
# TEMP: disable gds backend for arm64
# Install the wheel RUN if [ "$ARCH" = "arm64" ]; then \
# TODO: Move NIXL wheel install to the wheel_builder stage cd ${NIXL_SRC_DIR} && uv build . --out-dir /workspace/wheels/nixl \
RUN uv pip install /workspace/wheels/nixl/*.whl --config-settings=setup-args="-Ddisable_gds_backend=true -Dgds_path=/usr/local/cuda/targets/sbsa-linux"; \
else \
cd ${NIXL_SRC_DIR} && uv build . --out-dir /workspace/wheels/nixl; \
fi && \
# Install the wheel
# TODO: Move NIXL wheel install to the wheel_builder stage
uv pip install /workspace/wheels/nixl/*.whl
################################### ###################################
####### WHEEL BUILD STAGE ######### ####### WHEEL BUILD STAGE #########
...@@ -231,6 +243,9 @@ ARG CARGO_BUILD_JOBS ...@@ -231,6 +243,9 @@ ARG CARGO_BUILD_JOBS
# which might exceed the number of opened files limit. # which might exceed the number of opened files limit.
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
# Keep in sync with the base image.
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
WORKDIR /workspace WORKDIR /workspace
RUN yum update -y \ RUN yum update -y \
...@@ -246,9 +261,9 @@ ENV RUSTUP_HOME=/usr/local/rustup \ ...@@ -246,9 +261,9 @@ ENV RUSTUP_HOME=/usr/local/rustup \
COPY --from=build $RUSTUP_HOME $RUSTUP_HOME COPY --from=build $RUSTUP_HOME $RUSTUP_HOME
COPY --from=build $CARGO_HOME $CARGO_HOME COPY --from=build $CARGO_HOME $CARGO_HOME
COPY --from=build $NIXL_PREFIX $NIXL_PREFIX
COPY --from=build /workspace /workspace COPY --from=build /workspace /workspace
COPY --from=build $VIRTUAL_ENV $VIRTUAL_ENV COPY --from=build $VIRTUAL_ENV $VIRTUAL_ENV
COPY --from=build /usr/local/nixl /opt/nvidia/nvda_nixl
ENV PATH=$CARGO_HOME/bin:$VIRTUAL_ENV/bin:$PATH ENV PATH=$CARGO_HOME/bin:$VIRTUAL_ENV/bin:$PATH
# Copy configuration files # Copy configuration files
...@@ -321,9 +336,6 @@ RUN pip install dist/ai_dynamo_runtime*cp312*.whl && \ ...@@ -321,9 +336,6 @@ RUN pip install dist/ai_dynamo_runtime*cp312*.whl && \
ENV DYNAMO_HOME=/workspace ENV DYNAMO_HOME=/workspace
ARG ARCH_ALT
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu:$LD_LIBRARY_PATH
# Use UCX for TRTLLM KV Cache Transfer # Use UCX for TRTLLM KV Cache Transfer
ARG TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL ARG TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL
ENV TRTLLM_USE_UCX_KVCACHE=1 ENV TRTLLM_USE_UCX_KVCACHE=1
...@@ -383,9 +395,9 @@ COPY --from=build /usr/bin/nats-server /usr/bin/nats-server ...@@ -383,9 +395,9 @@ COPY --from=build /usr/bin/nats-server /usr/bin/nats-server
COPY --from=build /usr/local/bin/etcd/ /usr/local/bin/etcd/ COPY --from=build /usr/local/bin/etcd/ /usr/local/bin/etcd/
# Copy UCX from build image as plugin for NIXL # Copy UCX from build image as plugin for NIXL
# Copy NIXL source from wheel_builder image
COPY --from=build /usr/local/ucx /usr/local/ucx COPY --from=build /usr/local/ucx /usr/local/ucx
# Copy NIXL source from build image (required for NIXL plugins) COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
COPY --from=build /usr/local/nixl /usr/local/nixl
# Copy OpenMPI from build image # Copy OpenMPI from build image
COPY --from=build /opt/hpcx/ompi /opt/hpcx/ompi COPY --from=build /opt/hpcx/ompi /opt/hpcx/ompi
# Copy NUMA library from build image # Copy NUMA library from build image
...@@ -451,8 +463,18 @@ COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATT ...@@ -451,8 +463,18 @@ COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATT
COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/ COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/
# Setup environment variables # Setup environment variables
ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu/plugins ARG ARCH_ALT
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu:/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu/plugins:/usr/local/ucx/lib:/opt/hpcx/ompi/lib:$LD_LIBRARY_PATH ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
ENV LD_LIBRARY_PATH=\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
/opt/hpcx/ompi/lib:\
$LD_LIBRARY_PATH
ENV PATH=/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH ENV PATH=/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH
ENV OPAL_PREFIX=/opt/hpcx/ompi ENV OPAL_PREFIX=/opt/hpcx/ompi
......
...@@ -81,6 +81,13 @@ RUN apt-get update -y && \ ...@@ -81,6 +81,13 @@ RUN apt-get update -y && \
ARG NIXL_UCX_REF=v1.19.x ARG NIXL_UCX_REF=v1.19.x
ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4
ENV NIXL_SRC_DIR=/opt/nixl
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ARG ARCH_ALT
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
ENV LD_LIBRARY_PATH=$NIXL_LIB_DIR:$NIXL_PLUGIN_DIR:$LD_LIBRARY_PATH
WORKDIR /workspace WORKDIR /workspace
### UCX EFA Setup ### ### UCX EFA Setup ###
...@@ -123,24 +130,19 @@ WORKDIR /workspace ...@@ -123,24 +130,19 @@ WORKDIR /workspace
### NIXL SETUP ### ### NIXL SETUP ###
# Clone nixl source # Clone nixl source
# TEMP: disable gds backend for arm64 # TEMP: disable gds backend for arm64
RUN git clone "https://github.com/ai-dynamo/nixl.git" /opt/nixl && \ RUN git clone "https://github.com/ai-dynamo/nixl.git" ${NIXL_SRC_DIR} && \
cd /opt/nixl && \ cd ${NIXL_SRC_DIR} && \
git checkout ${NIXL_REF} && \ git checkout ${NIXL_REF} && \
if [ "$ARCH" = "arm64" ]; then \ if [ "$ARCH" = "arm64" ]; then \
cd /opt/nixl && \ nixl_build_args="-Ddisable_gds_backend=true -Dgds_path=/usr/local/cuda/targets/sbsa-linux"; \
mkdir build && \
meson setup build/ --buildtype=release --prefix=/usr/local/nixl -Ddisable_gds_backend=true -Dgds_path=/usr/local/cuda/targets/sbsa-linux && \
cd build/ && \
ninja && \
ninja install; \
else \ else \
cd /opt/nixl && \ nixl_build_args=""; \
mkdir build && \ fi && \
meson setup build/ --buildtype=release --prefix=/usr/local/nixl && \ mkdir build && \
cd build/ && \ meson setup build/ --buildtype=release --prefix=$NIXL_PREFIX $nixl_build_args && \
ninja && \ cd build/ && \
ninja install; \ ninja && \
fi ninja install;
### NATS & ETCD SETUP ### ### NATS & ETCD SETUP ###
ENV ETCD_VERSION="v3.5.21" ENV ETCD_VERSION="v3.5.21"
...@@ -168,11 +170,10 @@ ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ...@@ -168,11 +170,10 @@ ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# TODO: Move gds_path selection based on arch into NIXL build # TODO: Move gds_path selection based on arch into NIXL build
# TEMP: disable gds backend for arm64 # TEMP: disable gds backend for arm64
RUN if [ "$ARCH" = "arm64" ]; then \ RUN if [ "$ARCH" = "arm64" ]; then \
cd /opt/nixl && uv build . --out-dir /workspace/wheels/nixl \ cd ${NIXL_SRC_DIR} && uv build . --out-dir /workspace/wheels/nixl \
--config-settings=setup-args="-Ddisable_gds_backend=true" \ --config-settings=setup-args="-Ddisable_gds_backend=true -Dgds_path=/usr/local/cuda/targets/sbsa-linux"; \
--config-settings=setup-args="-Dgds_path=/usr/local/cuda/targets/sbsa-linux"; \
else \ else \
cd /opt/nixl && uv build . --out-dir /workspace/wheels/nixl; \ cd ${NIXL_SRC_DIR} && uv build . --out-dir /workspace/wheels/nixl; \
fi && \ fi && \
# Install the wheel # Install the wheel
# TODO: Move NIXL wheel install to the wheel_builder stage # TODO: Move NIXL wheel install to the wheel_builder stage
...@@ -315,6 +316,9 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} ...@@ -315,6 +316,9 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
# Use build arg RELEASE_BUILD = true to generate wheels for Python 3.10, 3.11 and 3.12. # Use build arg RELEASE_BUILD = true to generate wheels for Python 3.10, 3.11 and 3.12.
ARG RELEASE_BUILD ARG RELEASE_BUILD
# Keep in sync with the base image.
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
WORKDIR /workspace WORKDIR /workspace
RUN yum update -y \ RUN yum update -y \
...@@ -331,8 +335,7 @@ ENV RUSTUP_HOME=/usr/local/rustup \ ...@@ -331,8 +335,7 @@ ENV RUSTUP_HOME=/usr/local/rustup \
COPY --from=base $RUSTUP_HOME $RUSTUP_HOME COPY --from=base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=base $CARGO_HOME $CARGO_HOME COPY --from=base $CARGO_HOME $CARGO_HOME
# NIXL path default is NIXL_PREFIX=/opt/nvidia/nvda_nixl COPY --from=base $NIXL_PREFIX $NIXL_PREFIX
COPY --from=base /usr/local/nixl /opt/nvidia/nvda_nixl
COPY --from=base /workspace /workspace COPY --from=base /workspace /workspace
COPY --from=base $VIRTUAL_ENV $VIRTUAL_ENV COPY --from=base $VIRTUAL_ENV $VIRTUAL_ENV
ENV PATH=$CARGO_HOME/bin:$VIRTUAL_ENV/bin:$PATH ENV PATH=$CARGO_HOME/bin:$VIRTUAL_ENV/bin:$PATH
...@@ -380,11 +383,7 @@ ENV CARGO_TARGET_DIR=/workspace/target ...@@ -380,11 +383,7 @@ ENV CARGO_TARGET_DIR=/workspace/target
WORKDIR /workspace WORKDIR /workspace
COPY --from=wheel_builder /workspace /workspace COPY --from=wheel_builder /workspace /workspace
COPY --from=wheel_builder /opt/nvidia/nvda_nixl /opt/nvidia/nvda_nixl COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
ARG ARCH_ALT
ENV LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu:\
/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugin:\
$LD_LIBRARY_PATH
# Copy Cargo cache to avoid re-downloading dependencies # Copy Cargo cache to avoid re-downloading dependencies
COPY --from=wheel_builder $CARGO_HOME $CARGO_HOME COPY --from=wheel_builder $CARGO_HOME $CARGO_HOME
...@@ -439,6 +438,11 @@ ENV DYNAMO_HOME=/workspace ...@@ -439,6 +438,11 @@ ENV DYNAMO_HOME=/workspace
ENV VIRTUAL_ENV=/opt/dynamo/venv ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ARG ARCH_ALT
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# Install build-essential and python3-dev as apt dependencies # Install build-essential and python3-dev as apt dependencies
RUN apt-get update && \ RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
...@@ -462,15 +466,14 @@ ENV PATH=/usr/local/bin/etcd/:$PATH ...@@ -462,15 +466,14 @@ ENV PATH=/usr/local/bin/etcd/:$PATH
# Copy UCX from base image as plugin for NIXL # Copy UCX from base image as plugin for NIXL
# Copy NIXL source from wheel_builder image # Copy NIXL source from wheel_builder image
COPY --from=base /usr/local/ucx /usr/local/ucx COPY --from=base /usr/local/ucx /usr/local/ucx
COPY --from=wheel_builder /opt/nvidia/nvda_nixl /opt/nvidia/nvda_nixl COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
# Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries # Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries
COPY --from=base /opt/vllm /opt/vllm COPY --from=base /opt/vllm /opt/vllm
ARG ARCH_ALT
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\ /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu:\ $NIXL_LIB_DIR:\
/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugin:\ $NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\ /usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\ /usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH $LD_LIBRARY_PATH
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment