Commit 0fdfe4c3 authored by one
Browse files

Add a dtk dockerfile

parent 6b8e8104
# The base image can be swapped at build time through the BASE_IMAGE build argument.
ARG BASE_IMAGE=harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
FROM ${BASE_IMAGE}
# Component versions this image is built around.
# OS:
# - Ubuntu: 22.04
# - Docker Client: 20.10.8
# DTK:
# - DTK: 26.04
# Lib:
# - ucx: 1.20.0
# - openmpi: 5.0.9
# Intel:
# - mlc: v3.12 (NOTE: the MLC install step later in this file is commented out)
LABEL maintainer="SuperBench"
# Stops apt/dpkg from prompting during the package installs that follow.
# Declared with ENV, so the setting also persists into running containers.
ENV DEBIAN_FRONTEND=noninteractive
# Install the build toolchain, networking/SSH utilities and development headers
# used by the steps that follow.
# The apt package index and download cache are removed in this same layer so
# they do not end up in the image; any later step that needs apt must run
# `apt-get update` itself.
RUN apt-get update && \
    apt-get -q install -y --no-install-recommends \
        autoconf \
        automake \
        bc \
        build-essential \
        curl \
        dmidecode \
        git \
        iproute2 \
        jq \
        libaio-dev \
        libboost-program-options-dev \
        libcap2 \
        libcurl4-openssl-dev \
        libnuma-dev \
        libpci-dev \
        libssl-dev \
        libtinfo5 \
        libtool \
        lshw \
        net-tools \
        numactl \
        openssh-client \
        openssh-server \
        pciutils \
        rsync \
        sudo \
        util-linux \
        vim \
        wget \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*
# Fetch the static Docker release tarball and unpack its binaries straight
# into /usr/local/bin, then discard the archive.
ENV DOCKER_VERSION=20.10.8
RUN wget -q -O /tmp/docker.tgz \
        https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz && \
    tar -x -f /tmp/docker.tgz --strip-components=1 -C /usr/local/bin/ && \
    rm /tmp/docker.tgz
# Update system config:
# - prepare root's SSH directory and the sshd runtime directory,
# - allow root login and user environment files over SSH, listening on port 22,
# - raise the open-file limits for all users and for root.
# The Port substitution is anchored to the start of the line so it only touches
# the `Port` directive and not other options that merely contain "Port"
# (e.g. GatewayPorts).
# printf is used instead of `echo "...\n..."` because whether echo expands \n
# depends on which shell backs /bin/sh; printf writes one entry per line
# regardless.
RUN mkdir -p /root/.ssh && \
    touch /root/.ssh/authorized_keys && \
    mkdir -p /var/run/sshd && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/^[# ]*Port .*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' \
        '* soft nofile 1048576' \
        '* hard nofile 1048576' \
        'root soft nofile 1048576' \
        'root hard nofile 1048576' \
        >> /etc/security/limits.conf
# Location of the DTK installation; handed to the builds below as their ROCm root.
ENV ROCM_PATH=/opt/dtk

# Build UCX from the upstream release tarball and install it under UCX_HOME.
ARG UCX_VERSION=1.20.0
ARG UCX_HOME=/opt/ucx
RUN cd /tmp && \
    wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz && \
    tar -xzf ucx-${UCX_VERSION}.tar.gz && \
    cd ucx-${UCX_VERSION} && \
    ./contrib/configure-release \
        --prefix=${UCX_HOME} \
        --enable-optimizations \
        --enable-tuning \
        --enable-cma \
        --enable-mt \
        --with-mlx5 \
        --with-rc \
        --with-ud \
        --with-dc \
        --with-dm \
        --with-ib_hw_tm \
        --with-verbs=/usr/include \
        --with-rdmacm=/usr \
        --with-rocm=${ROCM_PATH} \
        --without-knem \
        --without-cuda \
        --without-java && \
    make -j "$(nproc)" && \
    make install && \
    rm -rf /tmp/ucx-${UCX_VERSION}*
# Build OpenMPI against the UCX and DTK installs above and place it in MPI_HOME.
# The download directory is the release series, i.e. OMPI_VERSION with its last
# dotted component stripped by the shell (5.0.9 -> v5.0).
ENV MPI_HOME=/opt/mpi
ARG OMPI_VERSION=5.0.9
RUN cd /tmp && \
    wget https://download.open-mpi.org/release/open-mpi/v${OMPI_VERSION%.*}/openmpi-${OMPI_VERSION}.tar.gz && \
    tar -xzf openmpi-${OMPI_VERSION}.tar.gz && \
    cd openmpi-${OMPI_VERSION} && \
    ./configure \
        --prefix=${MPI_HOME} \
        --with-ucx=${UCX_HOME} \
        --with-rocm=${ROCM_PATH} \
        --enable-builtin-atomics \
        --enable-wrapper-rpath \
        --enable-mca-no-build=btl-uct \
        --enable-prte-prefix-by-default && \
    make -j "$(nproc)" && \
    make install && \
    ldconfig && \
    cd / && \
    rm -rf /tmp/openmpi-${OMPI_VERSION}*
# Install Intel MLC (currently disabled: the steps below are commented out)
# RUN cd /tmp && \
# wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
# tar xzf mlc.tgz Linux/mlc && \
# cp ./Linux/mlc /usr/local/bin/ && \
# rm -rf ./Linux mlc.tgz
# Install AMD SMI Python Library.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python3 -m pip install --no-cache-dir amdsmi==5.7.0
# Runtime environment: put the MPI, UCX and SuperBench binaries and libraries
# ahead of whatever PATH / LD_LIBRARY_PATH the base image already defines, and
# set the SuperBench and Ansible variables.
ENV PATH="${MPI_HOME}/bin:${UCX_HOME}/bin:/opt/superbench/bin:/usr/local/bin/${PATH:+:${PATH}}" \
    LD_LIBRARY_PATH="${MPI_HOME}/lib:${UCX_HOME}/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \
    SB_HOME=/opt/superbench \
    SB_MICRO_PATH=/opt/superbench \
    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Replace /etc/environment with the three variables above so they are also
# recorded on disk, not only in the image's ENV.
RUN { \
        echo PATH="$PATH" && \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" && \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment
# Upgrade the Python packaging tools (setuptools stays pinned) and install mpi4py.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools==65.7 mpi4py
WORKDIR ${SB_HOME}
# third_party is copied and built before the rest of the source tree so this
# layer stays cached when only files outside third_party change.
# COPY replaces ADD: these are plain local paths, so ADD's archive-extraction
# and URL-fetching behaviour is not wanted.
COPY third_party third_party
# Build the `dtk` target of the third_party Makefile against the DTK and MPI
# installs; each `-o <target>` tells make not to remake that target.
RUN make RCCL_HOME=${ROCM_PATH}/rccl ROCM_PATH=${ROCM_PATH} HIP_HOME=${ROCM_PATH}/hip MPI_HOME=${MPI_HOME} -C third_party dtk -o cpu_hpl -o cpu_stream -o megatron_lm -o apex_rocm -o megatron_deepspeed -o rocm_megatron_lm
COPY . .
# ENV USE_HIP_DATATYPE=1
# ENV USE_HIPBLAS_COMPUTETYPE=1
# Install the package with its `hgworker` extra, then run the `cppbuild` target
# with hipcc as the C++ compiler, then the `postinstall` target.
# The requirement is quoted because an unquoted .[hgworker] is a shell glob
# that would expand to any matching dotfile (e.g. `.g`) in the working directory.
RUN python3 -m pip install --no-cache-dir ".[hgworker]" && \
    CXX=${ROCM_PATH}/bin/hipcc make cppbuild && \
    make postinstall
...@@ -202,6 +202,7 @@ def run(self): ...@@ -202,6 +202,7 @@ def run(self):
'cpuworker': x['torch'], 'cpuworker': x['torch'],
'amdworker': x['torch'] + x['amd'], 'amdworker': x['torch'] + x['amd'],
'nvworker': x['torch'] + x['ort'] + x['nvidia'], 'nvworker': x['torch'] + x['ort'] + x['nvidia'],
'hgworker': x['amd'],
} }
)( )(
{ {
......
...@@ -16,7 +16,7 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0") ...@@ -16,7 +16,7 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm .PHONY: all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
# Build targets. # Build targets.
all: cuda rocm all: cuda rocm
...@@ -24,6 +24,7 @@ all: cuda rocm ...@@ -24,6 +24,7 @@ all: cuda rocm
cuda_with_msccl: cuda cuda_msccl cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
dtk: common rocm_perftest rocm_rccl_tests megatron_deepspeed apex_rocm rocm_megatron_lm
cpu: common cpu_perftest cpu: common cpu_perftest
common: fio cpu_stream common: fio cpu_stream
...@@ -113,30 +114,36 @@ ifneq (,$(wildcard nccl-tests/Makefile)) ...@@ -113,30 +114,36 @@ ifneq (,$(wildcard nccl-tests/Makefile))
endif endif
# Build perftest. # Build perftest.
# The commit we use is 4bee61f80d9e268fc97eaf40be00409e91d3a19e. # The commit we use is ea1c778782df3ec09b5f8101017fc0140b51a63d.
cuda_perftest: cuda_perftest:
ifneq (,$(wildcard perftest/autogen.sh)) ifneq (,$(wildcard perftest/autogen.sh))
cd perftest && ./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --prefix=$(SB_MICRO_PATH) && make -j && make install cd perftest && ./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --prefix=$(SB_MICRO_PATH) && make -j && make install
endif endif
rocm_perftest: rocm_perftest:
ifneq (,$(wildcard perftest/autogen.sh)) ifneq (,$(wildcard perftest/autogen.sh))
cd perftest && ./autogen.sh && ./configure --enable-rocm --with-rocm=/opt/rocm --prefix=$(SB_MICRO_PATH) && make -j && make install cd perftest && ./autogen.sh && ./configure --enable-rocm --with-rocm=$(ROCM_PATH) --prefix=$(SB_MICRO_PATH) && make -j && make install
endif endif
cpu_perftest: cpu_perftest:
ifneq (,$(wildcard perftest/autogen.sh)) ifneq (,$(wildcard perftest/autogen.sh))
cd perftest && ./autogen.sh && ./configure --prefix=$(SB_MICRO_PATH) && make -j && make install cd perftest && ./autogen.sh && ./configure --prefix=$(SB_MICRO_PATH) && make -j && make install
endif endif
# Build FIO from commit d83ac9 (fio-3.28 tag). # Build FIO from commit ed675d347 (fio-3.41 tag).
fio: fio:
ifneq (,$(wildcard fio/Makefile)) ifneq (,$(wildcard fio/Makefile))
cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) --disable-native && make -j && make install cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) --disable-native && make -j && make install
endif endif
# Build rccl-tests from commit 46375b1 of default branch. # Build rccl-tests from commit 66e513c of default branch.
rocm_rccl_tests: sb_micro_path rocm_rccl_tests: sb_micro_path
ifneq (, $(wildcard rccl-tests/Makefile)) ifneq (, $(wildcard rccl-tests/install.sh))
cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j cd ./rccl-tests && \
ln -sf $$(which hipify-perl) $(ROCM_PATH)/bin/hipify-perl || true && \
./install.sh --mpi --mpi_home $(MPI_HOME) \
--rocm_home $(ROCM_PATH) \
--rccl_home $(RCCL_HOME) \
--hip_compiler hipcc \
--gpu_targets $$(paste -sd ',' $(ROCM_PATH)/bin/target.lst)
cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/ cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif endif
...@@ -225,13 +232,11 @@ directx_amf_encoding_latency: ...@@ -225,13 +232,11 @@ directx_amf_encoding_latency:
# Install requirements for Megatron-LM # Install requirements for Megatron-LM
megatron_lm: megatron_lm:
cd Megatron && \ cd Megatron && \
apt install -y python3-mpi4py && \
python -m pip install --no-cache-dir -r requirements.txt python -m pip install --no-cache-dir -r requirements.txt
# Install requirements for Megatron-DeepSpeed # Install requirements for Megatron-DeepSpeed
megatron_deepspeed: megatron_deepspeed:
cd Megatron && \ cd Megatron && \
apt install -y python3-mpi4py && \
python -m pip install --no-cache-dir -r requirements.txt && \ python -m pip install --no-cache-dir -r requirements.txt && \
python -m pip install DeepSpeed python -m pip install DeepSpeed
......
Subproject commit d83ac9d3d30d5f5dc6d0e425e4ba945a772839f6 Subproject commit ed675d3477a70a42d2e757b713f6c7125a27cdca
Subproject commit 4bee61f80d9e268fc97eaf40be00409e91d3a19e Subproject commit ea1c778782df3ec09b5f8101017fc0140b51a63d
Subproject commit 46375b1c527b2e3afe80fdd6dd136151bd939675 Subproject commit 66e513c24ff42394f5a0c1781f5868da7e094dd1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment