Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
b85f6851
Unverified
Commit
b85f6851
authored
Dec 21, 2023
by
Yuting Jiang
Committed by
GitHub
Dec 21, 2023
Browse files
Dockerfile - Bug fix for rocm docker build and deploy (#598)
**Description** Bug fix for rocm docker build and deploy.
parent
32ed692e
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
52 additions
and
36 deletions
+52
-36
.github/workflows/build-image.yml
.github/workflows/build-image.yml
+1
-1
dockerfile/rocm5.7.x.dockerfile
dockerfile/rocm5.7.x.dockerfile
+23
-26
superbench/benchmarks/micro_benchmarks/rocm_common.cmake
superbench/benchmarks/micro_benchmarks/rocm_common.cmake
+1
-2
superbench/runner/playbooks/deploy.yaml
superbench/runner/playbooks/deploy.yaml
+1
-1
third_party/Makefile
third_party/Makefile
+25
-5
third_party/rccl-tests
third_party/rccl-tests
+1
-1
No files found.
.github/workflows/build-image.yml
View file @
b85f6851
...
...
@@ -108,7 +108,7 @@ jobs:
username
:
${{ secrets.DOCKERHUB_USERNAME }}
password
:
${{ secrets.DOCKERHUB_TOKEN }}
-
name
:
Pull cache image
run
:
sudo docker pull ${{ steps.metadata.outputs.tags }}
run
:
sudo docker pull
$(cut -d, -f1 <<<
${{ steps.metadata.outputs.tags }}
)
continue-on-error
:
true
-
name
:
Login to the GitHub Container Registry
uses
:
docker/login-action@v1
...
...
dockerfile/rocm5.7.x.dockerfile
View file @
b85f6851
...
...
@@ -17,6 +17,7 @@ RUN apt-get update && \
apt-get
-q
install
-y
--no-install-recommends
\
autoconf
\
automake
\
bc
\
build-essential
\
curl
\
dmidecode
\
...
...
@@ -27,6 +28,7 @@ RUN apt-get update && \
libaio-dev
\
libboost-program-options-dev
\
libcap2
\
libcurl4-openssl-dev
\
libnuma-dev
\
libpci-dev
\
libssl-dev
\
...
...
@@ -38,6 +40,7 @@ RUN apt-get update && \
openssh-client
\
openssh-server
\
pciutils
\
python3-mpi4py
\
rsync
\
sudo
\
util-linux
\
...
...
@@ -46,11 +49,11 @@ RUN apt-get update && \
&&
\
rm
-rf
/tmp/
*
ARG
NUM_MAKE_JOBS=
16
ARG
NUM_MAKE_JOBS=
# Check if CMake is installed and its version
RUN
cmake_version
=
$(
cmake
--version
2>/dev/null |
grep
-oP
"(?<=cmake version )(
\d
+
\.\d
+)"
||
echo
"0.0"
)
&&
\
required_version
=
"3.2
6.4
"
&&
\
required_version
=
"3.2
4.1
"
&&
\
if
[
"
$(
printf
"%s
\n
"
"
$required_version
"
"
$cmake_version
"
|
sort
-V
|
head
-n
1
)
"
!=
"
$required_version
"
]
;
then
\
echo
"existing cmake version is
${
cmake_version
}
"
&&
\
cd
/tmp
&&
\
...
...
@@ -100,21 +103,9 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \
rm
-rf
MLNX_OFED_LINUX-
${
OFED_VERSION
}*
;
\
fi
# Install UCX
ENV
UCX_VERSION=1.14.1
RUN if
[
-z
"
$(
ls
-A
/opt/ucx
)
"
]
;
then
\
echo
"/opt/ucx is empty. Installing UCX..."
;
\
cd
/tmp
&&
\
git clone https://github.com/openucx/ucx.git
-b
v
${
UCX_VERSION
}
&&
\
cd
ucx
&&
\
./autogen.sh
&&
\
mkdir
build
&&
\
cd
build
&&
\
../configure
-prefix
=
$UCX_DIR
--with-rocm
=
/opt/rocm
--without-knem
&&
\
make
-j
$(
nproc
)
&&
make
-j
$(
nproc
)
install
&&
rm
-rf
/tmp/ucx-
${
UCX_VERSION
}
;
\
else
\
echo
"/opt/ucx is not empty. Skipping UCX installation."
;
\
fi
# Add target file to help determine which device(s) to build for
ENV
ROCM_PATH=/opt/rocm
RUN
bash
-c
'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx1030\ngfx1100\ngfx1101\ngfx1102\n" >> ${ROCM_PATH}/bin/target.lst'
# Install OpenMPI
ENV
OPENMPI_VERSION=4.1.x
...
...
@@ -127,7 +118,7 @@ RUN [ -d /usr/local/bin/mpirun ] || { \
./autogen.pl
&&
\
mkdir
build
&&
\
cd
build
&&
\
../configure
--prefix
=
/usr/local
--enable-orterun-prefix-by-default
--enable-mpirun-prefix-by-default
--enable-prte-prefix-by-default
--enable-mca-no-build
=
btl-uct
--with-ucx
=
/opt/ucx
--with-rocm
=
/opt/rocm
&&
\
../configure
--prefix
=
/usr/local
--enable-orterun-prefix-by-default
--enable-mpirun-prefix-by-default
--enable-prte-prefix-by-default
--with-rocm
=
/opt/rocm
&&
\
make
-j
$(
nproc
)
&&
\
make
-j
$(
nproc
)
install
&&
\
ldconfig
&&
\
...
...
@@ -148,12 +139,14 @@ RUN cd /opt/ && \
cd
rccl
&&
\
mkdir
build
&&
\
cd
build
&&
\
CXX
=
/opt/rocm/bin/hipcc cmake
-DCMAKE_PREFIX_PATH
=
/opt/rocm/ ..
&&
\
CXX
=
/opt/rocm/bin/hipcc cmake
-DHIP_COMPILER
=
clang
-DCMAKE_BUILD_TYPE
=
Release
-DCMAKE_VERBOSE_MAKEFILE
=
1
\
-DCMAKE_PREFIX_PATH
=
"
${
ROCM_PATH
}
/hsa;
${
ROCM_PATH
}
/hip;
${
ROCM_PATH
}
/share/rocm/cmake/;
${
ROCM_PATH
}
"
\
..
&&
\
make
-j
${
NUM_MAKE_JOBS
}
ENV
PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/
opt/ucx/lib:/
usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
...
...
@@ -163,13 +156,17 @@ RUN echo PATH="$PATH" > /etc/environment && \
echo
LD_LIBRARY_PATH
=
"
$LD_LIBRARY_PATH
"
>>
/etc/environment
&&
\
echo
SB_MICRO_PATH
=
"
$SB_MICRO_PATH
"
>>
/etc/environment
RUN
apt
install
rocm-cmake
-y
&&
\
python3
-m
pip
install
--upgrade
pip wheel
setuptools
==
65.7
WORKDIR
${SB_HOME}
ADD
third_party third_party
RUN
make
RCCL_HOME
=
/opt/rccl/build/
MPI_HOME
=
/usr/local
ROCBLAS_BRANCH
=
release/rocm-rel-5.7.1.1
HIPBLASLT_BRANCH
=
release-staging/rocm-rel-5.7
ROCM_VER
=
rocm-5.5.0
-C
third_party rocm
-o
cpu_hpl
-o
cpu_stream
-o
megatron_lm
ADD
. .
RUN
apt
install
rocm-cmake
-y
&&
\
python3
-m
pip
install
--upgrade
pip wheel
setuptools
==
65.7
&&
\
python3
-m
pip
install
.[amdworker]
&&
\
#ENV USE_HIPBLASLT_DATATYPE=1
ENV
CXX=/opt/rocm/bin/hipcc
RUN
python3
-m
pip
install
.[amdworker]
&&
\
make cppbuild
&&
\
make postinstall
RUN
make cppbuild
ADD
third_party third_party
RUN
make
RCCL_HOME
=
/opt/rccl/build/
ROCBLAS_BRANCH
=
release/rocm-rel-5.7.1.1
HIPBLASLT_BRANCH
=
release-staging/rocm-rel-5.7
ROCM_VER
=
rocm-5.5.0
-C
third_party rocm
-o
cpu_hpl
-o
cpu_stream
-o
megatron_lm
superbench/benchmarks/micro_benchmarks/rocm_common.cmake
View file @
b85f6851
...
...
@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if
(
EXISTS
${
HIP_PATH
}
)
# Search for hip in common locations
list
(
APPEND CMAKE_PREFIX_PATH
${
HIP_PATH
}
${
ROCM_PATH
}
)
set
(
CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH
)
list
(
APPEND CMAKE_PREFIX_PATH
${
HIP_PATH
}
${
ROCM_PATH
}
${
ROCM_PATH
}
/hsa
${
ROCM_PATH
}
/hip
${
ROCM_PATH
}
/share/rocm/cmake/
)
set
(
CMAKE_CXX_COMPILER
"
${
HIP_PATH
}
/bin/hipcc"
)
set
(
CMAKE_MODULE_PATH
"
${
HIP_PATH
}
/cmake"
${
CMAKE_MODULE_PATH
}
)
set
(
CMAKE_MODULE_PATH
"
${
HIP_PATH
}
/lib/cmake/hip"
${
CMAKE_MODULE_PATH
}
)
...
...
superbench/runner/playbooks/deploy.yaml
View file @
b85f6851
...
...
@@ -100,7 +100,7 @@
docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \
{{ '--gpus=all' if nvidia_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video
--device=/dev/kfd --device=/dev/dri --cap-add=SYS_PTRACE --shm-size=16G
' if amd_gpu_exist else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \
-v /var/run/docker.sock:/var/run/docker.sock \
--entrypoint /bin/bash {{ docker_image }} && \
...
...
third_party/Makefile
View file @
b85f6851
...
...
@@ -12,13 +12,13 @@ CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c
ROCBLAS_BRANCH
?=
rocm-
$(
shell
dpkg
-l
|
grep
'rocm-dev '
|
awk
'{print $$3
}
'
|
cut
-d
'.'
-f1-3
)
HIPBLASLT_BRANCH
?=
rocm-
$(
shell
dpkg
-l
|
grep
'rocm-dev '
|
awk
'{print $$3
}
'
|
cut
-d
'.'
-f1-3
)
.PHONY
:
all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
.PHONY
:
all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
apex_rocm
# Build all targets.
all
:
cuda rocm
cuda_with_msccl
:
cuda cuda_msccl
cuda
:
common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
rocm
:
common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
rocm
:
common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
apex_rocm
cpu
:
common cpu_perftest
common
:
cpu_hpl cpu_stream fio
directx_amd
:
directx_amf_encoding_latency
...
...
@@ -86,11 +86,11 @@ ifneq (,$(wildcard fio/Makefile))
cd
./fio
&&
./configure
--prefix
=
$(SB_MICRO_PATH)
--disable-native
&&
make
-j
&&
make
install
endif
# Build rccl-tests from commit
2a18737
of default branch.
# Build rccl-tests from commit
46375b1
of default branch.
rocm_rccl_tests
:
sb_micro_path
ifneq
(, $(wildcard rccl-tests/Makefile))
cd
./rccl-tests
&&
make
MPI
=
1
MPI_HOME
=
$(MPI_HOME)
HIP_HOME
=
$(HIP_HOME)
RCCL_HOME
=
$(RCCL_HOME)
-j
cp
-v
./rccl-tests/build/
*
$(SB_MICRO_PATH)
/bin/
cd
./rccl-tests
&&
make
MPI
=
1
MPI_HOME
=
$(MPI_HOME)
-j
cp
-v
-r
./rccl-tests/build/
*
$(SB_MICRO_PATH)
/bin/
endif
# Build rocblas-bench.
...
...
@@ -192,6 +192,26 @@ megatron_deepspeed:
python
-m
pip
install
-r
requirements.txt
&&
\
python
-m
pip
install
DeepSpeed
# Install apex for ROCm, which Megatron depends on
apex_rocm
:
$(
eval
TORCH_VERSION ?
=
$(
shell
python
-c
"import torch; print(torch.__version__
)
"
))
$(
eval
TORCH_MAJOR_VERSION ?
=
$(
word
1,
$(
subst
., ,
$(TORCH_VERSION)
)))
$(
eval
TORCH_MINOR_VERSION ?
=
$(
word
2,
$(
subst
., ,
$(TORCH_VERSION)
)))
if
[
!
-d
"apex"
]
;
then
\
git clone https://github.com/ROCmSoftwarePlatform/apex.git
;
\
fi
cd
apex
&&
\
if
[
"
$$
(expr
$(TORCH_MAJOR_VERSION)
\>
2)"
-eq
1
]
&&
[
"
$$
(expr
$(TORCH_MINOR_VERSION)
\>
1)"
-eq
1
]
;
then
\
git checkout master
;
\
elif
[
"
$$
(expr
$(TORCH_MAJOR_VERSION)
== 2)"
-eq
1
]
&&
[
"
$$
(expr
$(TORCH_MINOR_VERSION)
== 1)"
-eq
1
]
;
then
\
git checkout release/1.1.0
;
\
elif
[
"
$$
(expr
$(TORCH_MAJOR_VERSION)
== 2)"
-eq
1
]
&&
[
"
$$
(expr
$(TORCH_MINOR_VERSION)
== 0)"
-eq
1
]
;
then
\
git checkout release/1.0.0
;
\
elif
[
"
$$
(expr
$(TORCH_MAJOR_VERSION)
== 1)"
-eq
1
]
;
then
\
git checkout release/1.0.0
;
\
fi
pip
install
-v
--disable-pip-version-check
--no-build-isolation
--config-settings
"--build-option=--cpp_ext"
--config-settings
"--build-option=--cuda_ext"
./apex
# Build MSCCL for CUDA
cuda_msccl
:
sb_micro_path
ifneq
(,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
...
...
rccl-tests
@
46375b1c
Compare
2a18737d
...
46375b1c
Subproject commit
2a18737dc681e03ce82c046caa71b28db65017b
5
Subproject commit
46375b1c527b2e3afe80fdd6dd136151bd93967
5
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment