Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
b85f6851
Unverified
Commit
b85f6851
authored
Dec 21, 2023
by
Yuting Jiang
Committed by
GitHub
Dec 21, 2023
Browse files
Dockerfile - Bug fix for rocm docker build and deploy (#598)
**Description** Bug fix for rocm docker build and deploy.
parent
32ed692e
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
52 additions
and
36 deletions
+52
-36
.github/workflows/build-image.yml
.github/workflows/build-image.yml
+1
-1
dockerfile/rocm5.7.x.dockerfile
dockerfile/rocm5.7.x.dockerfile
+23
-26
superbench/benchmarks/micro_benchmarks/rocm_common.cmake
superbench/benchmarks/micro_benchmarks/rocm_common.cmake
+1
-2
superbench/runner/playbooks/deploy.yaml
superbench/runner/playbooks/deploy.yaml
+1
-1
third_party/Makefile
third_party/Makefile
+25
-5
third_party/rccl-tests
third_party/rccl-tests
+1
-1
No files found.
.github/workflows/build-image.yml
View file @
b85f6851
...
@@ -108,7 +108,7 @@ jobs:
...
@@ -108,7 +108,7 @@ jobs:
username
:
${{ secrets.DOCKERHUB_USERNAME }}
username
:
${{ secrets.DOCKERHUB_USERNAME }}
password
:
${{ secrets.DOCKERHUB_TOKEN }}
password
:
${{ secrets.DOCKERHUB_TOKEN }}
-
name
:
Pull cache image
-
name
:
Pull cache image
run
:
sudo docker pull ${{ steps.metadata.outputs.tags }}
run
:
sudo docker pull
$(cut -d, -f1 <<<
${{ steps.metadata.outputs.tags }}
)
continue-on-error
:
true
continue-on-error
:
true
-
name
:
Login to the GitHub Container Registry
-
name
:
Login to the GitHub Container Registry
uses
:
docker/login-action@v1
uses
:
docker/login-action@v1
...
...
dockerfile/rocm5.7.x.dockerfile
View file @
b85f6851
...
@@ -17,6 +17,7 @@ RUN apt-get update && \
...
@@ -17,6 +17,7 @@ RUN apt-get update && \
apt-get
-q
install
-y
--no-install-recommends
\
apt-get
-q
install
-y
--no-install-recommends
\
autoconf
\
autoconf
\
automake
\
automake
\
bc
\
build-essential
\
build-essential
\
curl
\
curl
\
dmidecode
\
dmidecode
\
...
@@ -27,6 +28,7 @@ RUN apt-get update && \
...
@@ -27,6 +28,7 @@ RUN apt-get update && \
libaio-dev
\
libaio-dev
\
libboost-program-options-dev
\
libboost-program-options-dev
\
libcap2
\
libcap2
\
libcurl4-openssl-dev
\
libnuma-dev
\
libnuma-dev
\
libpci-dev
\
libpci-dev
\
libssl-dev
\
libssl-dev
\
...
@@ -38,6 +40,7 @@ RUN apt-get update && \
...
@@ -38,6 +40,7 @@ RUN apt-get update && \
openssh-client
\
openssh-client
\
openssh-server
\
openssh-server
\
pciutils
\
pciutils
\
python3-mpi4py
\
rsync
\
rsync
\
sudo
\
sudo
\
util-linux
\
util-linux
\
...
@@ -46,11 +49,11 @@ RUN apt-get update && \
...
@@ -46,11 +49,11 @@ RUN apt-get update && \
&&
\
&&
\
rm
-rf
/tmp/
*
rm
-rf
/tmp/
*
ARG
NUM_MAKE_JOBS=
16
ARG
NUM_MAKE_JOBS=
# Check if CMake is installed and its version
# Check if CMake is installed and its version
RUN
cmake_version
=
$(
cmake
--version
2>/dev/null |
grep
-oP
"(?<=cmake version )(
\d
+
\.\d
+)"
||
echo
"0.0"
)
&&
\
RUN
cmake_version
=
$(
cmake
--version
2>/dev/null |
grep
-oP
"(?<=cmake version )(
\d
+
\.\d
+)"
||
echo
"0.0"
)
&&
\
required_version
=
"3.2
6.4
"
&&
\
required_version
=
"3.2
4.1
"
&&
\
if
[
"
$(
printf
"%s
\n
"
"
$required_version
"
"
$cmake_version
"
|
sort
-V
|
head
-n
1
)
"
!=
"
$required_version
"
]
;
then
\
if
[
"
$(
printf
"%s
\n
"
"
$required_version
"
"
$cmake_version
"
|
sort
-V
|
head
-n
1
)
"
!=
"
$required_version
"
]
;
then
\
echo
"existing cmake version is
${
cmake_version
}
"
&&
\
echo
"existing cmake version is
${
cmake_version
}
"
&&
\
cd
/tmp
&&
\
cd
/tmp
&&
\
...
@@ -100,21 +103,9 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \
...
@@ -100,21 +103,9 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \
rm
-rf
MLNX_OFED_LINUX-
${
OFED_VERSION
}*
;
\
rm
-rf
MLNX_OFED_LINUX-
${
OFED_VERSION
}*
;
\
fi
fi
# Install UCX
# Add target file to help determine which device(s) to build for
ENV
UCX_VERSION=1.14.1
ENV
ROCM_PATH=/opt/rocm
RUN if
[
-z
"
$(
ls
-A
/opt/ucx
)
"
]
;
then
\
RUN
bash
-c
'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx1030\ngfx1100\ngfx1101\ngfx1102\n" >> ${ROCM_PATH}/bin/target.lst'
echo
"/opt/ucx is empty. Installing UCX..."
;
\
cd
/tmp
&&
\
git clone https://github.com/openucx/ucx.git
-b
v
${
UCX_VERSION
}
&&
\
cd
ucx
&&
\
./autogen.sh
&&
\
mkdir
build
&&
\
cd
build
&&
\
../configure
-prefix
=
$UCX_DIR
--with-rocm
=
/opt/rocm
--without-knem
&&
\
make
-j
$(
nproc
)
&&
make
-j
$(
nproc
)
install
&&
rm
-rf
/tmp/ucx-
${
UCX_VERSION
}
;
\
else
\
echo
"/opt/ucx is not empty. Skipping UCX installation."
;
\
fi
# Install OpenMPI
# Install OpenMPI
ENV
OPENMPI_VERSION=4.1.x
ENV
OPENMPI_VERSION=4.1.x
...
@@ -127,7 +118,7 @@ RUN [ -d /usr/local/bin/mpirun ] || { \
...
@@ -127,7 +118,7 @@ RUN [ -d /usr/local/bin/mpirun ] || { \
./autogen.pl
&&
\
./autogen.pl
&&
\
mkdir
build
&&
\
mkdir
build
&&
\
cd
build
&&
\
cd
build
&&
\
../configure
--prefix
=
/usr/local
--enable-orterun-prefix-by-default
--enable-mpirun-prefix-by-default
--enable-prte-prefix-by-default
--enable-mca-no-build
=
btl-uct
--with-ucx
=
/opt/ucx
--with-rocm
=
/opt/rocm
&&
\
../configure
--prefix
=
/usr/local
--enable-orterun-prefix-by-default
--enable-mpirun-prefix-by-default
--enable-prte-prefix-by-default
--with-rocm
=
/opt/rocm
&&
\
make
-j
$(
nproc
)
&&
\
make
-j
$(
nproc
)
&&
\
make
-j
$(
nproc
)
install
&&
\
make
-j
$(
nproc
)
install
&&
\
ldconfig
&&
\
ldconfig
&&
\
...
@@ -148,12 +139,14 @@ RUN cd /opt/ && \
...
@@ -148,12 +139,14 @@ RUN cd /opt/ && \
cd
rccl
&&
\
cd
rccl
&&
\
mkdir
build
&&
\
mkdir
build
&&
\
cd
build
&&
\
cd
build
&&
\
CXX
=
/opt/rocm/bin/hipcc cmake
-DCMAKE_PREFIX_PATH
=
/opt/rocm/ ..
&&
\
CXX
=
/opt/rocm/bin/hipcc cmake
-DHIP_COMPILER
=
clang
-DCMAKE_BUILD_TYPE
=
Release
-DCMAKE_VERBOSE_MAKEFILE
=
1
\
-DCMAKE_PREFIX_PATH
=
"
${
ROCM_PATH
}
/hsa;
${
ROCM_PATH
}
/hip;
${
ROCM_PATH
}
/share/rocm/cmake/;
${
ROCM_PATH
}
"
\
..
&&
\
make
-j
${
NUM_MAKE_JOBS
}
make
-j
${
NUM_MAKE_JOBS
}
ENV
PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
ENV
PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/
opt/ucx/lib:/
usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
...
@@ -163,13 +156,17 @@ RUN echo PATH="$PATH" > /etc/environment && \
...
@@ -163,13 +156,17 @@ RUN echo PATH="$PATH" > /etc/environment && \
echo
LD_LIBRARY_PATH
=
"
$LD_LIBRARY_PATH
"
>>
/etc/environment
&&
\
echo
LD_LIBRARY_PATH
=
"
$LD_LIBRARY_PATH
"
>>
/etc/environment
&&
\
echo
SB_MICRO_PATH
=
"
$SB_MICRO_PATH
"
>>
/etc/environment
echo
SB_MICRO_PATH
=
"
$SB_MICRO_PATH
"
>>
/etc/environment
RUN
apt
install
rocm-cmake
-y
&&
\
python3
-m
pip
install
--upgrade
pip wheel
setuptools
==
65.7
WORKDIR
${SB_HOME}
WORKDIR
${SB_HOME}
ADD
third_party third_party
RUN
make
RCCL_HOME
=
/opt/rccl/build/
MPI_HOME
=
/usr/local
ROCBLAS_BRANCH
=
release/rocm-rel-5.7.1.1
HIPBLASLT_BRANCH
=
release-staging/rocm-rel-5.7
ROCM_VER
=
rocm-5.5.0
-C
third_party rocm
-o
cpu_hpl
-o
cpu_stream
-o
megatron_lm
ADD
. .
ADD
. .
RUN
apt
install
rocm-cmake
-y
&&
\
#ENV USE_HIPBLASLT_DATATYPE=1
python3
-m
pip
install
--upgrade
pip wheel
setuptools
==
65.7
&&
\
ENV
CXX=/opt/rocm/bin/hipcc
python3
-m
pip
install
.[amdworker]
&&
\
RUN
python3
-m
pip
install
.[amdworker]
&&
\
make cppbuild
&&
\
make postinstall
make postinstall
RUN
make cppbuild
ADD
third_party third_party
RUN
make
RCCL_HOME
=
/opt/rccl/build/
ROCBLAS_BRANCH
=
release/rocm-rel-5.7.1.1
HIPBLASLT_BRANCH
=
release-staging/rocm-rel-5.7
ROCM_VER
=
rocm-5.5.0
-C
third_party rocm
-o
cpu_hpl
-o
cpu_stream
-o
megatron_lm
superbench/benchmarks/micro_benchmarks/rocm_common.cmake
View file @
b85f6851
...
@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
...
@@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if
(
EXISTS
${
HIP_PATH
}
)
if
(
EXISTS
${
HIP_PATH
}
)
# Search for hip in common locations
# Search for hip in common locations
list
(
APPEND CMAKE_PREFIX_PATH
${
HIP_PATH
}
${
ROCM_PATH
}
)
list
(
APPEND CMAKE_PREFIX_PATH
${
HIP_PATH
}
${
ROCM_PATH
}
${
ROCM_PATH
}
/hsa
${
ROCM_PATH
}
/hip
${
ROCM_PATH
}
/share/rocm/cmake/
)
set
(
CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH
)
set
(
CMAKE_CXX_COMPILER
"
${
HIP_PATH
}
/bin/hipcc"
)
set
(
CMAKE_CXX_COMPILER
"
${
HIP_PATH
}
/bin/hipcc"
)
set
(
CMAKE_MODULE_PATH
"
${
HIP_PATH
}
/cmake"
${
CMAKE_MODULE_PATH
}
)
set
(
CMAKE_MODULE_PATH
"
${
HIP_PATH
}
/cmake"
${
CMAKE_MODULE_PATH
}
)
set
(
CMAKE_MODULE_PATH
"
${
HIP_PATH
}
/lib/cmake/hip"
${
CMAKE_MODULE_PATH
}
)
set
(
CMAKE_MODULE_PATH
"
${
HIP_PATH
}
/lib/cmake/hip"
${
CMAKE_MODULE_PATH
}
)
...
...
superbench/runner/playbooks/deploy.yaml
View file @
b85f6851
...
@@ -100,7 +100,7 @@
...
@@ -100,7 +100,7 @@
docker run -itd --name={{ container }} \
docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \
--privileged --net=host --ipc=host \
{{ '--gpus=all' if nvidia_gpu_exist else '' }} \
{{ '--gpus=all' if nvidia_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video
--device=/dev/kfd --device=/dev/dri --cap-add=SYS_PTRACE --shm-size=16G
' if amd_gpu_exist else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /var/run/docker.sock:/var/run/docker.sock \
--entrypoint /bin/bash {{ docker_image }} && \
--entrypoint /bin/bash {{ docker_image }} && \
...
...
third_party/Makefile
View file @
b85f6851
...
@@ -12,13 +12,13 @@ CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c
...
@@ -12,13 +12,13 @@ CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c
ROCBLAS_BRANCH
?=
rocm-
$(
shell
dpkg
-l
|
grep
'rocm-dev '
|
awk
'{print $$3
}
'
|
cut
-d
'.'
-f1-3
)
ROCBLAS_BRANCH
?=
rocm-
$(
shell
dpkg
-l
|
grep
'rocm-dev '
|
awk
'{print $$3
}
'
|
cut
-d
'.'
-f1-3
)
HIPBLASLT_BRANCH
?=
rocm-
$(
shell
dpkg
-l
|
grep
'rocm-dev '
|
awk
'{print $$3
}
'
|
cut
-d
'.'
-f1-3
)
HIPBLASLT_BRANCH
?=
rocm-
$(
shell
dpkg
-l
|
grep
'rocm-dev '
|
awk
'{print $$3
}
'
|
cut
-d
'.'
-f1-3
)
.PHONY
:
all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
.PHONY
:
all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
apex_rocm
# Build all targets.
# Build all targets.
all
:
cuda rocm
all
:
cuda rocm
cuda_with_msccl
:
cuda cuda_msccl
cuda_with_msccl
:
cuda cuda_msccl
cuda
:
common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
cuda
:
common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
rocm
:
common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
rocm
:
common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
apex_rocm
cpu
:
common cpu_perftest
cpu
:
common cpu_perftest
common
:
cpu_hpl cpu_stream fio
common
:
cpu_hpl cpu_stream fio
directx_amd
:
directx_amf_encoding_latency
directx_amd
:
directx_amf_encoding_latency
...
@@ -86,11 +86,11 @@ ifneq (,$(wildcard fio/Makefile))
...
@@ -86,11 +86,11 @@ ifneq (,$(wildcard fio/Makefile))
cd
./fio
&&
./configure
--prefix
=
$(SB_MICRO_PATH)
--disable-native
&&
make
-j
&&
make
install
cd
./fio
&&
./configure
--prefix
=
$(SB_MICRO_PATH)
--disable-native
&&
make
-j
&&
make
install
endif
endif
# Build rccl-tests from commit
2a18737
of default branch.
# Build rccl-tests from commit
46375b1
of default branch.
rocm_rccl_tests
:
sb_micro_path
rocm_rccl_tests
:
sb_micro_path
ifneq
(, $(wildcard rccl-tests/Makefile))
ifneq
(, $(wildcard rccl-tests/Makefile))
cd
./rccl-tests
&&
make
MPI
=
1
MPI_HOME
=
$(MPI_HOME)
HIP_HOME
=
$(HIP_HOME)
RCCL_HOME
=
$(RCCL_HOME)
-j
cd
./rccl-tests
&&
make
MPI
=
1
MPI_HOME
=
$(MPI_HOME)
-j
cp
-v
./rccl-tests/build/
*
$(SB_MICRO_PATH)
/bin/
cp
-v
-r
./rccl-tests/build/
*
$(SB_MICRO_PATH)
/bin/
endif
endif
# Build rocblas-bench.
# Build rocblas-bench.
...
@@ -192,6 +192,26 @@ megatron_deepspeed:
...
@@ -192,6 +192,26 @@ megatron_deepspeed:
python
-m
pip
install
-r
requirements.txt
&&
\
python
-m
pip
install
-r
requirements.txt
&&
\
python
-m
pip
install
DeepSpeed
python
-m
pip
install
DeepSpeed
# Install the ROCm build of apex, which Megatron depends on
apex_rocm
:
$(
eval
TORCH_VERSION ?
=
$(
shell
python
-c
"import torch; print(torch.__version__
)
"
))
$(
eval
TORCH_MAJOR_VERSION ?
=
$(
word
1,
$(
subst
., ,
$(TORCH_VERSION)
)))
$(
eval
TORCH_MINOR_VERSION ?
=
$(
word
2,
$(
subst
., ,
$(TORCH_VERSION)
)))
if
[
!
-d
"apex"
]
;
then
\
git clone https://github.com/ROCmSoftwarePlatform/apex.git
;
\
fi
cd
apex
&&
\
if
[
"
$$
(expr
$(TORCH_MAJOR_VERSION)
\>
2)"
-eq
1
]
&&
[
"
$$
(expr
$(TORCH_MINOR_VERSION)
\>
1)"
-eq
1
]
;
then
\
git checkout master
;
\
elif
[
"
$$
(expr
$(TORCH_MAJOR_VERSION)
== 2)"
-eq
1
]
&&
[
"
$$
(expr
$(TORCH_MINOR_VERSION)
== 1)"
-eq
1
]
;
then
\
git checkout release/1.1.0
;
\
elif
[
"
$$
(expr
$(TORCH_MAJOR_VERSION)
== 2)"
-eq
1
]
&&
[
"
$$
(expr
$(TORCH_MINOR_VERSION)
== 0)"
-eq
1
]
;
then
\
git checkout release/1.0.0
;
\
elif
[
"
$$
(expr
$(TORCH_MAJOR_VERSION)
== 1)"
-eq
1
]
;
then
\
git checkout release/1.0.0
;
\
fi
pip
install
-v
--disable-pip-version-check
--no-build-isolation
--config-settings
"--build-option=--cpp_ext"
--config-settings
"--build-option=--cuda_ext"
./apex
# Build MSCCL for CUDA
# Build MSCCL for CUDA
cuda_msccl
:
sb_micro_path
cuda_msccl
:
sb_micro_path
ifneq
(,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
ifneq
(,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
...
...
rccl-tests
@
46375b1c
Compare
2a18737d
...
46375b1c
Subproject commit
2a18737dc681e03ce82c046caa71b28db65017b
5
Subproject commit
46375b1c527b2e3afe80fdd6dd136151bd93967
5
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment