Unverified commit b85f6851, authored by Yuting Jiang, committed by GitHub
Browse files

Dockerfile - Bug fix for rocm docker build and deploy (#598)

**Description**
Fixes bugs in the ROCm Docker image build and deployment.
parent 32ed692e
...@@ -108,7 +108,7 @@ jobs: ...@@ -108,7 +108,7 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }} username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }} password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Pull cache image - name: Pull cache image
run: sudo docker pull ${{ steps.metadata.outputs.tags }} run: sudo docker pull $(cut -d, -f1 <<<${{ steps.metadata.outputs.tags }})
continue-on-error: true continue-on-error: true
- name: Login to the GitHub Container Registry - name: Login to the GitHub Container Registry
uses: docker/login-action@v1 uses: docker/login-action@v1
......
...@@ -17,6 +17,7 @@ RUN apt-get update && \ ...@@ -17,6 +17,7 @@ RUN apt-get update && \
apt-get -q install -y --no-install-recommends \ apt-get -q install -y --no-install-recommends \
autoconf \ autoconf \
automake \ automake \
bc \
build-essential \ build-essential \
curl \ curl \
dmidecode \ dmidecode \
...@@ -27,6 +28,7 @@ RUN apt-get update && \ ...@@ -27,6 +28,7 @@ RUN apt-get update && \
libaio-dev \ libaio-dev \
libboost-program-options-dev \ libboost-program-options-dev \
libcap2 \ libcap2 \
libcurl4-openssl-dev \
libnuma-dev \ libnuma-dev \
libpci-dev \ libpci-dev \
libssl-dev \ libssl-dev \
...@@ -38,6 +40,7 @@ RUN apt-get update && \ ...@@ -38,6 +40,7 @@ RUN apt-get update && \
openssh-client \ openssh-client \
openssh-server \ openssh-server \
pciutils \ pciutils \
python3-mpi4py \
rsync \ rsync \
sudo \ sudo \
util-linux \ util-linux \
...@@ -46,11 +49,11 @@ RUN apt-get update && \ ...@@ -46,11 +49,11 @@ RUN apt-get update && \
&& \ && \
rm -rf /tmp/* rm -rf /tmp/*
ARG NUM_MAKE_JOBS=16 ARG NUM_MAKE_JOBS=
# Check if CMake is installed and its version # Check if CMake is installed and its version
RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \ RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \
required_version="3.26.4" && \ required_version="3.24.1" && \
if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \ if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \
echo "existing cmake version is ${cmake_version}" && \ echo "existing cmake version is ${cmake_version}" && \
cd /tmp && \ cd /tmp && \
...@@ -100,21 +103,9 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \ ...@@ -100,21 +103,9 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \
rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \ rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \
fi fi
# Install UCX # Add target file to help determine which device(s) to build for
ENV UCX_VERSION=1.14.1 ENV ROCM_PATH=/opt/rocm
RUN if [ -z "$(ls -A /opt/ucx)" ]; then \ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx1030\ngfx1100\ngfx1101\ngfx1102\n" >> ${ROCM_PATH}/bin/target.lst'
echo "/opt/ucx is empty. Installing UCX..."; \
cd /tmp && \
git clone https://github.com/openucx/ucx.git -b v${UCX_VERSION} && \
cd ucx && \
./autogen.sh && \
mkdir build && \
cd build && \
../configure -prefix=$UCX_DIR --with-rocm=/opt/rocm --without-knem && \
make -j $(nproc) && make -j $(nproc) install && rm -rf /tmp/ucx-${UCX_VERSION} ; \
else \
echo "/opt/ucx is not empty. Skipping UCX installation."; \
fi
# Install OpenMPI # Install OpenMPI
ENV OPENMPI_VERSION=4.1.x ENV OPENMPI_VERSION=4.1.x
...@@ -127,7 +118,7 @@ RUN [ -d /usr/local/bin/mpirun ] || { \ ...@@ -127,7 +118,7 @@ RUN [ -d /usr/local/bin/mpirun ] || { \
./autogen.pl && \ ./autogen.pl && \
mkdir build && \ mkdir build && \
cd build && \ cd build && \
../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --enable-mca-no-build=btl-uct --with-ucx=/opt/ucx --with-rocm=/opt/rocm && \ ../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \
make -j $(nproc) && \ make -j $(nproc) && \
make -j $(nproc) install && \ make -j $(nproc) install && \
ldconfig && \ ldconfig && \
...@@ -148,12 +139,14 @@ RUN cd /opt/ && \ ...@@ -148,12 +139,14 @@ RUN cd /opt/ && \
cd rccl && \ cd rccl && \
mkdir build && \ mkdir build && \
cd build && \ cd build && \
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. && \ CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \
-DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \
.. && \
make -j${NUM_MAKE_JOBS} make -j${NUM_MAKE_JOBS}
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/opt/ucx/lib:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \ SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \ ANSIBLE_DEPRECATION_WARNINGS=FALSE \
...@@ -163,13 +156,17 @@ RUN echo PATH="$PATH" > /etc/environment && \ ...@@ -163,13 +156,17 @@ RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
RUN apt install rocm-cmake -y && \
python3 -m pip install --upgrade pip wheel setuptools==65.7
WORKDIR ${SB_HOME} WORKDIR ${SB_HOME}
ADD third_party third_party
RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=/usr/local ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release-staging/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
ADD . . ADD . .
RUN apt install rocm-cmake -y && \ #ENV USE_HIPBLASLT_DATATYPE=1
python3 -m pip install --upgrade pip wheel setuptools==65.7 && \ ENV CXX=/opt/rocm/bin/hipcc
python3 -m pip install .[amdworker] && \ RUN python3 -m pip install .[amdworker] && \
make cppbuild && \
make postinstall make postinstall
RUN make cppbuild
ADD third_party third_party
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release-staging/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
...@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") ...@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if(EXISTS ${HIP_PATH}) if(EXISTS ${HIP_PATH})
# Search for hip in common locations # Search for hip in common locations
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH}) list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH} ${ROCM_PATH}/hsa ${ROCM_PATH}/hip ${ROCM_PATH}/share/rocm/cmake/)
set(CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH)
set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc") set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc")
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH})
......
...@@ -100,7 +100,7 @@ ...@@ -100,7 +100,7 @@
docker run -itd --name={{ container }} \ docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \ --privileged --net=host --ipc=host \
{{ '--gpus=all' if nvidia_gpu_exist else '' }} \ {{ '--gpus=all' if nvidia_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \ {{ '--security-opt seccomp=unconfined --group-add video --device=/dev/kfd --device=/dev/dri --cap-add=SYS_PTRACE --shm-size=16G' if amd_gpu_exist else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \ -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
-v /var/run/docker.sock:/var/run/docker.sock \ -v /var/run/docker.sock:/var/run/docker.sock \
--entrypoint /bin/bash {{ docker_image }} && \ --entrypoint /bin/bash {{ docker_image }} && \
......
...@@ -12,13 +12,13 @@ CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c ...@@ -12,13 +12,13 @@ CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3) ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3) HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
# Build all targets. # Build all targets.
all: cuda rocm all: cuda rocm
cuda_with_msccl: cuda cuda_msccl cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
cpu: common cpu_perftest cpu: common cpu_perftest
common: cpu_hpl cpu_stream fio common: cpu_hpl cpu_stream fio
directx_amd: directx_amf_encoding_latency directx_amd: directx_amf_encoding_latency
...@@ -86,11 +86,11 @@ ifneq (,$(wildcard fio/Makefile)) ...@@ -86,11 +86,11 @@ ifneq (,$(wildcard fio/Makefile))
cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) --disable-native && make -j && make install cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) --disable-native && make -j && make install
endif endif
# Build rccl-tests from commit 2a18737 of default branch. # Build rccl-tests from commit 46375b1 of default branch.
rocm_rccl_tests: sb_micro_path rocm_rccl_tests: sb_micro_path
ifneq (, $(wildcard rccl-tests/Makefile)) ifneq (, $(wildcard rccl-tests/Makefile))
cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) HIP_HOME=$(HIP_HOME) RCCL_HOME=$(RCCL_HOME) -j cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
cp -v ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/ cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif endif
# Build rocblas-bench. # Build rocblas-bench.
...@@ -192,6 +192,26 @@ megatron_deepspeed: ...@@ -192,6 +192,26 @@ megatron_deepspeed:
python -m pip install -r requirements.txt && \ python -m pip install -r requirements.txt && \
python -m pip install DeepSpeed python -m pip install DeepSpeed
# Install the ROCm fork of apex, which Megatron depends on.
#
# The apex branch is selected from the installed PyTorch version:
#   torch >= 2.2 (or any later major version) -> master
#   torch 2.1                                 -> release/1.1.0
#   torch 2.0 and torch 1.x                   -> release/1.0.0
# TORCH_VERSION, TORCH_MAJOR_VERSION and TORCH_MINOR_VERSION can be overridden
# on the command line; otherwise they are detected from the `python` on PATH.
apex_rocm:
	$(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)"))
	$(eval TORCH_MAJOR_VERSION ?= $(word 1,$(subst ., ,$(TORCH_VERSION))))
	$(eval TORCH_MINOR_VERSION ?= $(word 2,$(subst ., ,$(TORCH_VERSION))))
	@if [ -z "$(TORCH_MAJOR_VERSION)" ] || [ -z "$(TORCH_MINOR_VERSION)" ]; then \
		echo "apex_rocm: cannot determine PyTorch version (got '$(TORCH_VERSION)')" >&2 ; \
		exit 1 ; \
	fi
	if [ ! -d "apex" ]; then \
		git clone https://github.com/ROCmSoftwarePlatform/apex.git ; \
	fi
	cd apex && \
	if [ "$(TORCH_MAJOR_VERSION)" -gt 2 ] || { [ "$(TORCH_MAJOR_VERSION)" -eq 2 ] && [ "$(TORCH_MINOR_VERSION)" -gt 1 ]; }; then \
		git checkout master ; \
	elif [ "$(TORCH_MAJOR_VERSION)" -eq 2 ] && [ "$(TORCH_MINOR_VERSION)" -eq 1 ]; then \
		git checkout release/1.1.0 ; \
	elif [ "$(TORCH_MAJOR_VERSION)" -eq 2 ] && [ "$(TORCH_MINOR_VERSION)" -eq 0 ]; then \
		git checkout release/1.0.0 ; \
	elif [ "$(TORCH_MAJOR_VERSION)" -eq 1 ]; then \
		git checkout release/1.0.0 ; \
	fi
	python -m pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex
# Build MSCCL for CUDA # Build MSCCL for CUDA
cuda_msccl: sb_micro_path cuda_msccl: sb_micro_path
ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile)) ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
......
Subproject commit 2a18737dc681e03ce82c046caa71b28db65017b5 Subproject commit 46375b1c527b2e3afe80fdd6dd136151bd939675
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment