Unverified commit 712bf9ec, authored by ybyang, committed by GitHub
Browse files

[pd] optimize dockerfile for pd disaggregation (#7319)


Co-authored-by: zhyncs <me@zhyncs.com>
parent 9c6a0656
ARG BASE_IMAGE ARG BASE_IMAGE
FROM ${BASE_IMAGE} FROM ${BASE_IMAGE}
# Deps
# Install every OS-level package needed by the later build steps in a single
# layer. Each package is listed exactly once, in alphabetical order, so that
# duplicates are easy to spot in review.
# - DEBIAN_FRONTEND is set inline (not via ENV) so debconf prompts cannot block
#   the build and the setting does not leak into the runtime environment.
# - The apt package index is removed in the same layer to keep it out of the image.
# - `ln -sf` is used so the step still succeeds when the base image already
#   provides /usr/bin/python.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
ccache \
check \
cmake \
curl \
debhelper \
devscripts \
dkms \
fakeroot \
ibverbs-providers \
infiniband-diags \
kmod \
libboost-all-dev \
libcurl4-openssl-dev \
libczmq-dev \
libczmq4 \
libfabric-dev \
libgoogle-glog-dev \
libgrpc++-dev \
libgrpc-dev \
libgtest-dev \
libhiredis-dev \
libibumad3 \
libibverbs-dev \
libibverbs1 \
libjsoncpp-dev \
libnl-3-200 \
libnl-3-dev \
libnl-route-3-200 \
libnl-route-3-dev \
libnuma-dev \
libopenmpi-dev \
libprotobuf-dev \
libpython3-dev \
librdmacm1 \
libssl-dev \
libsubunit-dev \
libsubunit0 \
libunwind-dev \
netcat-openbsd \
nvidia-dkms-535 \
openssh-server \
patchelf \
perftest \
pkg-config \
protobuf-compiler-grpc \
pybind11-dev \
python3 \
python3-pip \
rdma-core \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python3 /usr/bin/python
# CMake # CMake
RUN apt-get update \ RUN wget https://github.com/Kitware/CMake/releases/download/v3.27.4/cmake-3.27.4-linux-x86_64.sh \
&& apt-get install -y --no-install-recommends \
build-essential \
wget \
libssl-dev \
&& wget https://github.com/Kitware/CMake/releases/download/v3.27.4/cmake-3.27.4-linux-x86_64.sh \
&& chmod +x cmake-3.27.4-linux-x86_64.sh \ && chmod +x cmake-3.27.4-linux-x86_64.sh \
&& ./cmake-3.27.4-linux-x86_64.sh --skip-license --prefix=/usr/local \ && ./cmake-3.27.4-linux-x86_64.sh --skip-license --prefix=/usr/local \
&& rm cmake-3.27.4-linux-x86_64.sh && rm cmake-3.27.4-linux-x86_64.sh
# Python
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
python3 \
python3-pip \
&& ln -s /usr/bin/python3 /usr/bin/python
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
# GDRCopy # GDRCopy
WORKDIR /tmp RUN mkdir -p /tmp \
RUN git clone https://github.com/NVIDIA/gdrcopy.git && cd /tmp \
WORKDIR /tmp/gdrcopy && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
RUN git checkout v2.4.4 && cd /tmp/gdrcopy/packages \
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb \
&& dpkg -i libgdrapi_*.deb \
&& dpkg -i gdrcopy-tests_*.deb \
&& dpkg -i gdrcopy_*.deb
RUN apt update
RUN apt install -y nvidia-dkms-535
RUN apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms
RUN apt install -y check libsubunit0 libsubunit-dev
WORKDIR /tmp/gdrcopy/packages
RUN CUDA=/usr/local/cuda ./build-deb-packages.sh
RUN dpkg -i gdrdrv-dkms_*.deb
RUN dpkg -i libgdrapi_*.deb
RUN dpkg -i gdrcopy-tests_*.deb
RUN dpkg -i gdrcopy_*.deb
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
# IBGDA dependency # IBGDA dependency
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
RUN apt-get install -y libfabric-dev
# DeepEP # DeepEP
WORKDIR /sgl-workspace WORKDIR /sgl-workspace
RUN git clone https://github.com/deepseek-ai/DeepEP.git RUN git clone https://github.com/deepseek-ai/DeepEP.git
# NVSHMEM # NVSHMEM
WORKDIR /sgl-workspace
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
RUN tar -xf nvshmem_src_3.2.5-1.txz \ RUN tar -xf nvshmem_src_3.2.5-1.txz \
&& mv nvshmem_src nvshmem && mv nvshmem_src nvshmem \
&& cd /sgl-workspace/nvshmem \
&& git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \
&& sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu \
&& cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu
WORKDIR /sgl-workspace/nvshmem # Compile NVSHMEM
RUN git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch
RUN sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu && \
cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu
WORKDIR /sgl-workspace/nvshmem
ENV CUDA_HOME=/usr/local/cuda ENV CUDA_HOME=/usr/local/cuda
RUN NVSHMEM_SHMEM_SUPPORT=0 \ RUN cd /sgl-workspace/nvshmem && NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \ NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \ NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \ NVSHMEM_MPI_SUPPORT=0 \
...@@ -77,5 +110,5 @@ WORKDIR /sgl-workspace/DeepEP ...@@ -77,5 +110,5 @@ WORKDIR /sgl-workspace/DeepEP
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
RUN NVSHMEM_DIR=/sgl-workspace/nvshmem/install pip install --break-system-packages . RUN NVSHMEM_DIR=/sgl-workspace/nvshmem/install pip install --break-system-packages .
# Set workspace # Install mooncake transfer engine
WORKDIR /sgl-workspace RUN pip install --upgrade mooncake_transfer_engine --break-system-packages
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment