# Dockerfile.deepep
# Base image is chosen by the caller at build time, e.g.
#   docker build --build-arg BASE_IMAGE=<image:tag> -f Dockerfile.deepep .
# BASE_IMAGE has no default, so the build fails when it is not supplied.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
# Deps
# System packages for the RDMA/InfiniBand stack, the C/C++ build toolchain,
# Debian packaging tools (used when building the GDRCopy .deb packages) and
# Python. Packages are listed once each, alphabetically, to keep diffs readable.
#
# - DEBIAN_FRONTEND=noninteractive is set only for this command so package
#   configuration prompts cannot stall the build; it is not persisted in the
#   image environment.
# - git and wget are installed here because later steps in this file invoke
#   them and the base image is not guaranteed to provide them.
# - The apt lists are removed in the same layer to keep the layer small.
# - `ln -sf` is used so the step does not fail when /usr/bin/python already
#   exists in the base image.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        build-essential \
        ccache \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        dkms \
        fakeroot \
        git \
        ibverbs-providers \
        infiniband-diags \
        kmod \
        libboost-all-dev \
        libcurl4-openssl-dev \
        libczmq-dev \
        libczmq4 \
        libfabric-dev \
        libgoogle-glog-dev \
        libgrpc++-dev \
        libgrpc-dev \
        libgtest-dev \
        libhiredis-dev \
        libibumad3 \
        libibverbs-dev \
        libibverbs1 \
        libjsoncpp-dev \
        libnl-3-200 \
        libnl-3-dev \
        libnl-route-3-200 \
        libnl-route-3-dev \
        libnuma-dev \
        libopenmpi-dev \
        libprotobuf-dev \
        libpython3-dev \
        librdmacm1 \
        libssl-dev \
        libsubunit-dev \
        libsubunit0 \
        libunwind-dev \
        netcat-openbsd \
        nvidia-dkms-535 \
        openssh-server \
        patchelf \
        perftest \
        pkg-config \
        protobuf-compiler-grpc \
        pybind11-dev \
        python3 \
        python3-pip \
        rdma-core \
        wget \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3 /usr/bin/python

# CMake
# Install the official Kitware CMake binary release into /usr/local.
# The version is a build argument (default 3.27.4) so it can be bumped with
# --build-arg CMAKE_VERSION=... without editing this file.
# curl is used for the download because it is installed by the Deps step; the
# installer is fetched, executed and deleted in a single layer so it does not
# remain in the image.
ARG CMAKE_VERSION=3.27.4
RUN curl -fsSL -o /tmp/cmake-install.sh \
        "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" \
    && sh /tmp/cmake-install.sh --skip-license --prefix=/usr/local \
    && rm /tmp/cmake-install.sh


# GDRCopy
# GDRCOPY_HOME points at the driver source directory for version 2.4.4
# (presumably populated by the gdrdrv-dkms package installed below - confirm).
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
# Build the GDRCopy v2.4.4 Debian packages from source and install them.
# - The clone is shallow (--depth 1): only the tagged tree is needed.
# - CUDA tells build-deb-packages.sh where the CUDA toolkit lives.
# - The checkout, including the generated .deb files, is deleted in the same
#   layer so the build tree is not kept in the image.
RUN git clone --depth 1 -b v2.4.4 https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
    && cd /tmp/gdrcopy/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb \
    && dpkg -i libgdrapi_*.deb \
    && dpkg -i gdrcopy-tests_*.deb \
    && dpkg -i gdrcopy_*.deb \
    && rm -rf /tmp/gdrcopy


# IBGDA dependency
# Create the unversioned libmlx5.so name as a symlink to libmlx5.so.1.
# NOTE(review): presumably needed so the IBGDA-enabled NVSHMEM build can link
# against libmlx5 by its unversioned name - confirm.
# -f replaces an existing link, so the step is safe to re-run.
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so

# DeepEP
# /sgl-workspace is the working directory for the source checkouts; the
# DeepEP repository is cloned into /sgl-workspace/DeepEP.
WORKDIR /sgl-workspace
RUN git clone https://github.com/deepseek-ai/DeepEP.git /sgl-workspace/DeepEP

# NVSHMEM
# Fetch the NVSHMEM 3.2.5 source archive into the current working directory
# (expected to be /sgl-workspace), unpack it as /sgl-workspace/nvshmem and
# apply the patch shipped in the DeepEP checkout.
# - Download, extraction and removal of the archive happen in one layer so the
#   tarball is not kept in the image.
# - curl is used for the download because it is installed by the Deps step.
# - The sed call inserts `#include <unistd.h>` as the first line of
#   examples/moe_shuffle.cu.
RUN curl -fsSL -o nvshmem_src_3.2.5-1.txz \
        https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz \
    && tar -xf nvshmem_src_3.2.5-1.txz \
    && rm nvshmem_src_3.2.5-1.txz \
    && mv nvshmem_src nvshmem \
    && cd /sgl-workspace/nvshmem \
    && git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \
    && sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu

# Compile NVSHMEM
ENV CUDA_HOME=/usr/local/cuda
# Configure, build and install NVSHMEM into /sgl-workspace/nvshmem/install.
# - The NVSHMEM_* variables are passed in the environment of the cmake
#   configure command only: IBGDA and GDRCopy are set to 1; SHMEM, UCX, NCCL,
#   MPI, PMIx and device-polling timeout are set to 0.
# - CMAKE_CUDA_ARCHITECTURES=90 restricts code generation to compute
#   capability 9.0.
# - The job count is capped at the number of CPUs; a bare `make -j` places no
#   limit on parallel jobs and can exhaust memory during the build.
RUN cd /sgl-workspace/nvshmem \
    && NVSHMEM_SHMEM_SUPPORT=0 \
       NVSHMEM_UCX_SUPPORT=0 \
       NVSHMEM_USE_NCCL=0 \
       NVSHMEM_MPI_SUPPORT=0 \
       NVSHMEM_IBGDA_SUPPORT=1 \
       NVSHMEM_PMIX_SUPPORT=0 \
       NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
       NVSHMEM_USE_GDRCOPY=1 \
       cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/sgl-workspace/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \
    && cd build \
    && make install -j"$(nproc)"

# Build and install DeepEP from its checkout.
# NVSHMEM_DIR is exported with ENV, so it is visible to the pip build below and
# persists in the image; it points at the NVSHMEM install prefix.
# --break-system-packages allows pip to install into the system Python;
# --no-cache-dir keeps pip's download/build cache out of the layer.
WORKDIR /sgl-workspace/DeepEP
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
RUN pip install --no-cache-dir --break-system-packages .

# Install mooncake transfer engine
# --upgrade replaces any version already present in the base image; the
# version is not pinned, so the result depends on what is published at build
# time. --no-cache-dir keeps pip's cache out of the layer.
RUN pip install --no-cache-dir --upgrade --break-system-packages mooncake_transfer_engine