Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
b85f6851
Unverified
Commit
b85f6851
authored
Dec 21, 2023
by
Yuting Jiang
Committed by
GitHub
Dec 21, 2023
Browse files
Dockerfile - Bug fix for rocm docker build and deploy (#598)
**Description** Bug fix for rocm docker build and deploy.
parent
32ed692e
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
52 additions
and
36 deletions
+52
-36
.github/workflows/build-image.yml
.github/workflows/build-image.yml
+1
-1
dockerfile/rocm5.7.x.dockerfile
dockerfile/rocm5.7.x.dockerfile
+23
-26
superbench/benchmarks/micro_benchmarks/rocm_common.cmake
superbench/benchmarks/micro_benchmarks/rocm_common.cmake
+1
-2
superbench/runner/playbooks/deploy.yaml
superbench/runner/playbooks/deploy.yaml
+1
-1
third_party/Makefile
third_party/Makefile
+25
-5
third_party/rccl-tests
third_party/rccl-tests
+1
-1
No files found.
.github/workflows/build-image.yml
View file @
b85f6851
...
...
@@ -108,7 +108,7 @@ jobs:
username
:
${{ secrets.DOCKERHUB_USERNAME }}
password
:
${{ secrets.DOCKERHUB_TOKEN }}
-
name
:
Pull cache image
run
:
sudo docker pull ${{ steps.metadata.outputs.tags }}
run
:
sudo docker pull
$(cut -d, -f1 <<<
${{ steps.metadata.outputs.tags }}
)
continue-on-error
:
true
-
name
:
Login to the GitHub Container Registry
uses
:
docker/login-action@v1
...
...
dockerfile/rocm5.7.x.dockerfile
View file @
b85f6851
...
...
@@ -17,6 +17,7 @@ RUN apt-get update && \
apt-get
-q
install
-y
--no-install-recommends
\
autoconf
\
automake
\
bc
\
build-essential
\
curl
\
dmidecode
\
...
...
@@ -27,6 +28,7 @@ RUN apt-get update && \
libaio-dev
\
libboost-program-options-dev
\
libcap2
\
libcurl4-openssl-dev
\
libnuma-dev
\
libpci-dev
\
libssl-dev
\
...
...
@@ -38,6 +40,7 @@ RUN apt-get update && \
openssh-client
\
openssh-server
\
pciutils
\
python3-mpi4py
\
rsync
\
sudo
\
util-linux
\
...
...
@@ -46,11 +49,11 @@ RUN apt-get update && \
&&
\
rm
-rf
/tmp/
*
ARG
NUM_MAKE_JOBS=
16
ARG
NUM_MAKE_JOBS=
# Check if CMake is installed and its version
RUN
cmake_version
=
$(
cmake
--version
2>/dev/null |
grep
-oP
"(?<=cmake version )(
\d
+
\.\d
+)"
||
echo
"0.0"
)
&&
\
required_version
=
"3.2
6.4
"
&&
\
required_version
=
"3.2
4.1
"
&&
\
if
[
"
$(
printf
"%s
\n
"
"
$required_version
"
"
$cmake_version
"
|
sort
-V
|
head
-n
1
)
"
!=
"
$required_version
"
]
;
then
\
echo
"existing cmake version is
${
cmake_version
}
"
&&
\
cd
/tmp
&&
\
...
...
@@ -100,21 +103,9 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \
rm
-rf
MLNX_OFED_LINUX-
${
OFED_VERSION
}*
;
\
fi
# Install UCX
ENV
UCX_VERSION=1.14.1
RUN if
[
-z
"
$(
ls
-A
/opt/ucx
)
"
]
;
then
\
echo
"/opt/ucx is empty. Installing UCX..."
;
\
cd
/tmp
&&
\
git clone https://github.com/openucx/ucx.git
-b
v
${
UCX_VERSION
}
&&
\
cd
ucx
&&
\
./autogen.sh
&&
\
mkdir
build
&&
\
cd
build
&&
\
../configure
-prefix
=
$UCX_DIR
--with-rocm
=
/opt/rocm
--without-knem
&&
\
make
-j
$(
nproc
)
&&
make
-j
$(
nproc
)
install
&&
rm
-rf
/tmp/ucx-
${
UCX_VERSION
}
;
\
else
\
echo
"/opt/ucx is not empty. Skipping UCX installation."
;
\
fi
# Add target file to help determine which device(s) to build for.
# Each line appended to target.lst is one GPU target ID; gfx90a is listed
# twice, once with the xnack feature off ("xnack-") and once with it on
# ("xnack+"). The feature name must be spelled "xnack" to be a valid target ID.
ENV ROCM_PATH=/opt/rocm
RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnack+\ngfx940\ngfx941\ngfx942\ngfx1030\ngfx1100\ngfx1101\ngfx1102\n" >> ${ROCM_PATH}/bin/target.lst'
# Install OpenMPI
ENV
OPENMPI_VERSION=4.1.x
...
...
@@ -127,7 +118,7 @@ RUN [ -d /usr/local/bin/mpirun ] || { \
./autogen.pl
&&
\
mkdir
build
&&
\
cd
build
&&
\
../configure
--prefix
=
/usr/local
--enable-orterun-prefix-by-default
--enable-mpirun-prefix-by-default
--enable-prte-prefix-by-default
--enable-mca-no-build
=
btl-uct
--with-ucx
=
/opt/ucx
--with-rocm
=
/opt/rocm
&&
\
../configure
--prefix
=
/usr/local
--enable-orterun-prefix-by-default
--enable-mpirun-prefix-by-default
--enable-prte-prefix-by-default
--with-rocm
=
/opt/rocm
&&
\
make
-j
$(
nproc
)
&&
\
make
-j
$(
nproc
)
install
&&
\
ldconfig
&&
\
...
...
@@ -148,12 +139,14 @@ RUN cd /opt/ && \
cd
rccl
&&
\
mkdir
build
&&
\
cd
build
&&
\
CXX
=
/opt/rocm/bin/hipcc cmake
-DCMAKE_PREFIX_PATH
=
/opt/rocm/ ..
&&
\
CXX
=
/opt/rocm/bin/hipcc cmake
-DHIP_COMPILER
=
clang
-DCMAKE_BUILD_TYPE
=
Release
-DCMAKE_VERBOSE_MAKEFILE
=
1
\
-DCMAKE_PREFIX_PATH
=
"
${
ROCM_PATH
}
/hsa;
${
ROCM_PATH
}
/hip;
${
ROCM_PATH
}
/share/rocm/cmake/;
${
ROCM_PATH
}
"
\
..
&&
\
make
-j
${
NUM_MAKE_JOBS
}
ENV
PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/
opt/ucx/lib:/
usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
...
...
@@ -163,13 +156,17 @@ RUN echo PATH="$PATH" > /etc/environment && \
echo
LD_LIBRARY_PATH
=
"
$LD_LIBRARY_PATH
"
>>
/etc/environment
&&
\
echo
SB_MICRO_PATH
=
"
$SB_MICRO_PATH
"
>>
/etc/environment
RUN
apt
install
rocm-cmake
-y
&&
\
python3
-m
pip
install
--upgrade
pip wheel
setuptools
==
65.7
WORKDIR
${SB_HOME}
ADD
third_party third_party
RUN
make
RCCL_HOME
=
/opt/rccl/build/
MPI_HOME
=
/usr/local
ROCBLAS_BRANCH
=
release/rocm-rel-5.7.1.1
HIPBLASLT_BRANCH
=
release-staging/rocm-rel-5.7
ROCM_VER
=
rocm-5.5.0
-C
third_party rocm
-o
cpu_hpl
-o
cpu_stream
-o
megatron_lm
ADD
. .
RUN
apt
install
rocm-cmake
-y
&&
\
python3
-m
pip
install
--upgrade
pip wheel
setuptools
==
65.7
&&
\
python3
-m
pip
install
.[amdworker]
&&
\
#ENV USE_HIPBLASLT_DATATYPE=1
ENV
CXX=/opt/rocm/bin/hipcc
RUN
python3
-m
pip
install
.[amdworker]
&&
\
make cppbuild
&&
\
make postinstall
RUN
make cppbuild
ADD
third_party third_party
RUN
make
RCCL_HOME
=
/opt/rccl/build/
ROCBLAS_BRANCH
=
release/rocm-rel-5.7.1.1
HIPBLASLT_BRANCH
=
release-staging/rocm-rel-5.7
ROCM_VER
=
rocm-5.5.0
-C
third_party rocm
-o
cpu_hpl
-o
cpu_stream
-o
megatron_lm
superbench/benchmarks/micro_benchmarks/rocm_common.cmake
View file @
b85f6851
...
...
@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if
(
EXISTS
${
HIP_PATH
}
)
# Search for hip in common locations
list
(
APPEND CMAKE_PREFIX_PATH
${
HIP_PATH
}
${
ROCM_PATH
}
)
set
(
CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH
)
list
(
APPEND CMAKE_PREFIX_PATH
${
HIP_PATH
}
${
ROCM_PATH
}
${
ROCM_PATH
}
/hsa
${
ROCM_PATH
}
/hip
${
ROCM_PATH
}
/share/rocm/cmake/
)
set
(
CMAKE_CXX_COMPILER
"
${
HIP_PATH
}
/bin/hipcc"
)
set
(
CMAKE_MODULE_PATH
"
${
HIP_PATH
}
/cmake"
${
CMAKE_MODULE_PATH
}
)
set
(
CMAKE_MODULE_PATH
"
${
HIP_PATH
}
/lib/cmake/hip"
${
CMAKE_MODULE_PATH
}
)
...
...
superbench/runner/playbooks/deploy.yaml
View file @
b85f6851
...
...
@@ -100,7 +100,7 @@
docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \
{{ '--gpus=all' if nvidia_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video
--device=/dev/kfd --device=/dev/dri --cap-add=SYS_PTRACE --shm-size=16G
' if amd_gpu_exist else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \
-v /var/run/docker.sock:/var/run/docker.sock \
--entrypoint /bin/bash {{ docker_image }} && \
...
...
third_party/Makefile
View file @
b85f6851
...
...
@@ -12,13 +12,13 @@ CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c
ROCBLAS_BRANCH
?=
rocm-
$(
shell
dpkg
-l
|
grep
'rocm-dev '
|
awk
'{print $$3
}
'
|
cut
-d
'.'
-f1-3
)
HIPBLASLT_BRANCH
?=
rocm-
$(
shell
dpkg
-l
|
grep
'rocm-dev '
|
awk
'{print $$3
}
'
|
cut
-d
'.'
-f1-3
)
.PHONY
:
all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
.PHONY
:
all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
apex_rocm
# Build all targets.
all
:
cuda rocm
cuda_with_msccl
:
cuda cuda_msccl
cuda
:
common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
rocm
:
common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
rocm
:
common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
apex_rocm
cpu
:
common cpu_perftest
common
:
cpu_hpl cpu_stream fio
directx_amd
:
directx_amf_encoding_latency
...
...
@@ -86,11 +86,11 @@ ifneq (,$(wildcard fio/Makefile))
cd
./fio
&&
./configure
--prefix
=
$(SB_MICRO_PATH)
--disable-native
&&
make
-j
&&
make
install
endif
# Build rccl-tests from commit
2a18737
of default branch.
# Build rccl-tests from commit
46375b1
of default branch.
rocm_rccl_tests
:
sb_micro_path
ifneq
(, $(wildcard rccl-tests/Makefile))
cd
./rccl-tests
&&
make
MPI
=
1
MPI_HOME
=
$(MPI_HOME)
HIP_HOME
=
$(HIP_HOME)
RCCL_HOME
=
$(RCCL_HOME)
-j
cp
-v
./rccl-tests/build/
*
$(SB_MICRO_PATH)
/bin/
cd
./rccl-tests
&&
make
MPI
=
1
MPI_HOME
=
$(MPI_HOME)
-j
cp
-v
-r
./rccl-tests/build/
*
$(SB_MICRO_PATH)
/bin/
endif
# Build rocblas-bench.
...
...
@@ -192,6 +192,26 @@ megatron_deepspeed:
python
-m
pip
install
-r
requirements.txt
&&
\
python
-m
pip
install
DeepSpeed
# Install apex for ROCm, which Megatron depends on.
#
# The apex branch is selected from the installed PyTorch version
# (override with TORCH_VERSION / TORCH_MAJOR_VERSION / TORCH_MINOR_VERSION):
#   torch >  2.1         -> master
#   torch == 2.1         -> release/1.1.0
#   torch == 2.0 or 1.x  -> release/1.0.0
# Any other version leaves the existing checkout untouched.
apex_rocm:
	$(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)"))
	$(eval TORCH_MAJOR_VERSION ?= $(word 1,$(subst ., ,$(TORCH_VERSION))))
	$(eval TORCH_MINOR_VERSION ?= $(word 2,$(subst ., ,$(TORCH_VERSION))))
	# Clone only once so repeated builds reuse the existing checkout.
	if [ ! -d "apex" ]; then \
		git clone https://github.com/ROCmSoftwarePlatform/apex.git ; \
	fi
	# "Newer than 2.1" means major > 2, OR major == 2 with minor > 1.
	# Requiring both major > 2 and minor > 1 would skip 2.2+ and 3.0/3.1.
	cd apex && \
	if [ "$$(expr $(TORCH_MAJOR_VERSION) \> 2)" -eq 1 ] || \
	   { [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) \> 1)" -eq 1 ]; }; then \
		git checkout master ; \
	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) == 1)" -eq 1 ]; then \
		git checkout release/1.1.0 ; \
	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) == 0)" -eq 1 ]; then \
		git checkout release/1.0.0 ; \
	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 1)" -eq 1 ]; then \
		git checkout release/1.0.0 ; \
	fi
	pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex
# Build MSCCL for CUDA
cuda_msccl
:
sb_micro_path
ifneq
(,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
...
...
rccl-tests
@
46375b1c
Compare
2a18737d
...
46375b1c
Subproject commit
2a18737dc681e03ce82c046caa71b28db65017b
5
Subproject commit
46375b1c527b2e3afe80fdd6dd136151bd93967
5
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment