# dtk26.04.dockerfile - SuperBench image built on top of the DTK 26.04 base image.
# Base image; override at build time with --build-arg BASE_IMAGE=<image>.
# An ARG declared before FROM is only visible to the FROM line itself.
ARG BASE_IMAGE=harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204

FROM ${BASE_IMAGE}

# Versions of the main components this image is built from / installs.
# OS:
#   - Ubuntu: 22.04
#   - Docker Client: 20.10.8
# DTK:
#   - DTK: 26.04
# Lib:
#   - ucx: 1.20.0
#   - openmpi: 5.0.9
#   - amdsmi: 5.7.0
# Intel:
#   - mlc: v3.12

LABEL maintainer="SuperBench"

# Keep apt/debconf non-interactive so package installation never waits on a prompt.
ENV DEBIAN_FRONTEND=noninteractive

# Build toolchain, development headers and diagnostic utilities used by the
# steps below. The apt package index and cache are removed in the same layer
# that created them; deleting them in a later layer would not shrink the image.
RUN apt-get update && \
    apt-get -q install -y --no-install-recommends \
    autoconf \
    automake \
    bc \
    build-essential \
    curl \
    dmidecode \
    git \
    iproute2 \
    jq \
    libaio-dev \
    libboost-program-options-dev \
    libcap2 \
    libcurl4-openssl-dev \
    libnuma-dev \
    libpci-dev \
    libssl-dev \
    libtinfo5 \
    libtool \
    lshw \
    net-tools \
    numactl \
    openssh-client \
    openssh-server \
    pciutils \
    rsync \
    sudo \
    util-linux \
    vim \
    wget \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*

# Install Docker
# Downloads the static release tarball for DOCKER_VERSION and unpacks its
# contents into /usr/local/bin/, dropping the archive's top-level directory.
ENV DOCKER_VERSION=20.10.8
RUN wget -q -O /tmp/docker.tgz \
        https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz && \
    tar -x -f /tmp/docker.tgz --strip-components=1 -C /usr/local/bin/ && \
    rm /tmp/docker.tgz

# Update system config
#   - create root's SSH directory/authorized_keys and the sshd runtime directory
#   - sshd: allow root login, allow user environment files, listen on port 22
#   - raise the open-file limit (nofile) to 1048576 for all users and for root
# printf writes the limits lines instead of echo "...\n...": whether echo
# expands "\n" depends on which shell runs the RUN step, printf always does.
RUN mkdir -p /root/.ssh && \
    touch /root/.ssh/authorized_keys && \
    mkdir -p /var/run/sshd && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' \
        '* soft nofile 1048576' \
        '* hard nofile 1048576' \
        'root soft nofile 1048576' \
        'root hard nofile 1048576' \
        >> /etc/security/limits.conf

# Prefix handed to every later step that asks for a ROCm location
# (UCX/OpenMPI --with-rocm, amdsmi install prefix, rocblas-bench link, third_party make).
ENV ROCM_PATH=/opt/dtk

# Install UCX
# Built from the upstream release tarball with ROCm support taken from
# ${ROCM_PATH}; KNEM, CUDA and Java support are switched off.
# UCX_HOME is a build argument, but it is still in scope for the later ENV
# instruction that adds ${UCX_HOME}/bin and ${UCX_HOME}/lib to the search paths.
ARG UCX_VERSION=1.20.0
ARG UCX_HOME=/opt/ucx
RUN cd /tmp && \
    wget -q https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz && \
    tar xzf ucx-${UCX_VERSION}.tar.gz && \
    cd ucx-${UCX_VERSION} && \
    ./contrib/configure-release --prefix=${UCX_HOME} \
    --enable-optimizations --enable-tuning \
    --enable-cma --enable-mt \
    --with-mlx5 --with-rc --with-ud --with-dc --with-dm --with-ib_hw_tm \
    --with-verbs=/usr/include --with-rdmacm=/usr \
    --with-rocm=${ROCM_PATH} \
    --without-knem --without-cuda --without-java && \
    make -j $(nproc) && \
    # Empty the prefix first so the install does not mix with files already there.
    rm -rf ${UCX_HOME} && \
    make install && \
    # Leave the build tree before deleting it, together with the tarball.
    cd / && \
    rm -rf /tmp/ucx-${UCX_VERSION}*

# Install OpenMPI
# Built from the upstream release tarball against the UCX build in ${UCX_HOME}
# and the ROCm prefix in ${ROCM_PATH}; the btl-uct component is not built.
# ${OMPI_VERSION%.*} is expanded by the shell and strips the patch component,
# giving the "v<major>.<minor>" directory of the download URL.
ENV MPI_HOME=/opt/mpi
ARG OMPI_VERSION=5.0.9
RUN cd /tmp && \
    wget -q https://download.open-mpi.org/release/open-mpi/v${OMPI_VERSION%.*}/openmpi-${OMPI_VERSION}.tar.gz && \
    tar xzf openmpi-${OMPI_VERSION}.tar.gz && \
    cd openmpi-${OMPI_VERSION} && \
    ./configure --prefix=${MPI_HOME} \
    --with-ucx=${UCX_HOME} \
    --with-rocm=${ROCM_PATH} \
    --enable-builtin-atomics \
    --enable-wrapper-rpath \
    --enable-mca-no-build=btl-uct \
    --enable-prte-prefix-by-default && \
    make -j $(nproc) && \
    # Empty the prefix first so the install does not mix with files already there.
    rm -rf ${MPI_HOME} && \
    make install && \
    ldconfig && \
    cd / && \
    rm -rf /tmp/openmpi-${OMPI_VERSION}*

# Install Intel MLC
# Only the Linux/mlc binary is extracted from the archive; it is copied to
# /usr/local/bin/ and the download plus the extracted directory are removed.
RUN cd /tmp && \
    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
    tar xzf mlc.tgz Linux/mlc && \
    cp ./Linux/mlc /usr/local/bin/ && \
    rm -rf ./Linux mlc.tgz

# Install AMD SMI Python Library
# The rocm-5.7.0 source tag is built with CMake and installed under
# ${ROCM_PATH}; the Python package of the same version is then installed with pip.
# --transform renames the archive's top-level directory to "amdsmi" on extraction.
RUN cd /tmp && \
    wget -q https://github.com/ROCm/amdsmi/archive/refs/tags/rocm-5.7.0.tar.gz -O amdsmi.tar.gz && \
    tar xzf amdsmi.tar.gz --transform 's/amdsmi-rocm-5.7.0/amdsmi/' && \
    cd amdsmi && \
    cmake -S . -B build && \
    cmake --build build -j $(nproc) && \
    cmake --install build --prefix ${ROCM_PATH}/ && \
    # Step back to /tmp before cleaning up: the relative paths below would not
    # match anything from inside the source tree, leaving both in the layer.
    cd /tmp && \
    rm -rf amdsmi.tar.gz amdsmi && \
    python3 -m pip install amdsmi==5.7.0

# Add rocblas-bench to path
# The benchmark tool shipped under lib/rocblas/benchmark_tool is symlinked into
# ${ROCM_PATH}/bin/ and marked executable.
RUN ln -s ${ROCM_PATH}/lib/rocblas/benchmark_tool/rocblas-bench ${ROCM_PATH}/bin/ && \
    chmod +x ${ROCM_PATH}/bin/rocblas-bench

# Search paths: OpenMPI and UCX first, then SuperBench's bin directory and the
# local system locations. Whatever the base image already had in the variable
# is appended after a ":" (nothing is appended when it was empty or unset).
ENV PATH="${MPI_HOME}/bin:${UCX_HOME}/bin:/opt/superbench/bin:/usr/local/bin/${PATH:+:${PATH}}"
ENV LD_LIBRARY_PATH="${MPI_HOME}/lib:${UCX_HOME}/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# SuperBench locations; SB_HOME is also used as the working directory below.
ENV SB_HOME=/opt/superbench
ENV SB_MICRO_PATH=/opt/superbench

# Ansible settings.
ENV ANSIBLE_DEPRECATION_WARNINGS=FALSE
ENV ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Write PATH, LD_LIBRARY_PATH and SB_MICRO_PATH to /etc/environment, replacing
# any previous content of that file.
# NOTE(review): presumably so that SSH login sessions see the same values - confirm.
RUN { \
        echo PATH="$PATH" && \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" && \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment

# Relative paths in the remaining instructions resolve inside ${SB_HOME}.
WORKDIR ${SB_HOME}

# third_party is copied on its own, ahead of the rest of the sources, so a
# change elsewhere in the build context does not invalidate this layer.
COPY third_party third_party

# Build the `dtk` goal of the Makefile in third_party, handing it the
# DTK/ROCm, RCCL, HIP and MPI locations as make variables.
# Each --old-file=<target> tells make to treat that target as up to date, so
# it is not rebuilt as part of `dtk`.
# The bind mount exposes the root of `hyhal` at /opt/hyhal for this step only.
# NOTE(review): `hyhal` is not defined in this file - presumably a named build
# context or external image supplied at build time; confirm.
RUN --mount=type=bind,from=hyhal,source=/,target=/opt/hyhal \
    make --directory=third_party \
    ROCM_PATH=${ROCM_PATH} \
    RCCL_HOME=${ROCM_PATH}/rccl \
    HIP_HOME=${ROCM_PATH}/hip \
    MPI_HOME=${MPI_HOME} \
    --old-file=cpu_hpl \
    --old-file=cpu_stream \
    --old-file=megatron_lm \
    --old-file=apex_rocm \
    --old-file=megatron_deepspeed \
    --old-file=rocm_megatron_lm \
    dtk

# Copy the rest of the build context into ${SB_HOME} and install the package.
COPY . .
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
# '.[hgworker]' is quoted because an unquoted [...] is a shell glob pattern and
# could be expanded to a matching dot-file name instead of reaching pip intact.
# The bind mount exposes the root of `hyhal` at /opt/hyhal for this step only.
# NOTE(review): `hyhal` is not defined in this file - presumably supplied at
# build time; confirm.
RUN --mount=type=bind,from=hyhal,source=/,target=/opt/hyhal \
    python3 -m pip install --upgrade pip wheel setuptools==65.7 mpi4py && \
    python3 -m pip install --no-build-isolation '.[hgworker]' && \
    make cppbuild && \
    make postinstall