Unverified Commit 949f9cb4 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Release - SuperBench v0.11.0 (#654)



**Description**
Cherry pick bug fixes from v0.11.0 to main

**Major Revision**
* #645 
* #648 
* #646 
* #647 
* #651 
* #652 
* #650

---------
Co-authored-by: default avatarhongtaozhang <hongtaozhang@microsoft.com>
Co-authored-by: default avatarYifan Xiong <yifan.xiong@microsoft.com>
parent 9f3231e9
...@@ -40,16 +40,16 @@ jobs: ...@@ -40,16 +40,16 @@ jobs:
tags: superbench/main:cuda11.1.1,superbench/superbench:latest tags: superbench/main:cuda11.1.1,superbench/superbench:latest
runner: ubuntu-latest runner: ubuntu-latest
build_args: "NUM_MAKE_JOBS=8" build_args: "NUM_MAKE_JOBS=8"
- name: rocm5.7
dockerfile: rocm5.7.x
tags: superbench/main:rocm5.7
runner: [self-hosted, rocm-build]
build_args: "NUM_MAKE_JOBS=64"
- name: rocm6.0 - name: rocm6.0
dockerfile: rocm6.0.x dockerfile: rocm6.0.x
tags: superbench/main:rocm6.0 tags: superbench/main:rocm6.0
runner: [self-hosted, rocm-build] runner: [self-hosted, rocm-build]
build_args: "NUM_MAKE_JOBS=64" build_args: "NUM_MAKE_JOBS=16"
- name: rocm6.2
dockerfile: rocm6.2.x
tags: superbench/main:rocm6.2
runner: [self-hosted, rocm-build]
build_args: "NUM_MAKE_JOBS=16"
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v2 uses: actions/checkout@v2
...@@ -68,6 +68,8 @@ jobs: ...@@ -68,6 +68,8 @@ jobs:
else else
echo "No Docker images found with the specified references." echo "No Docker images found with the specified references."
fi fi
# Stop any leftover build containers before pruning.
# NOTE: `docker ps -q` prints only hex container IDs, so piping it through
# `grep build` can never match; filter on the container NAME instead.
sudo docker ps --filter "name=build" -q | xargs -r sudo docker stop
# Prune everything non-interactively (-f replaces the `echo y |` hack).
sudo docker system prune -af --volumes
df -h df -h
- name: Prepare metadata - name: Prepare metadata
id: metadata id: metadata
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
__SuperBench__ is a validation and profiling tool for AI infrastructure. __SuperBench__ is a validation and profiling tool for AI infrastructure.
📢 [v0.10.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.10.0) has been released! 📢 [v0.11.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.11.0) has been released!
## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._ ## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._
......
...@@ -83,11 +83,13 @@ RUN cd /tmp && \ ...@@ -83,11 +83,13 @@ RUN cd /tmp && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
# Install HPC-X # Install HPC-X
ENV HPCX_VERSION=v2.9.0
RUN cd /opt && \ RUN cd /opt && \
wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \ rm -rf hpcx && \
tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \ wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-inbox-ubuntu20.04-x86_64.tbz -O hpcx.tbz && \
ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \ tar xf hpcx.tbz && \
rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz mv hpcx-${HPCX_VERSION}-gcc-inbox-ubuntu20.04-x86_64 hpcx && \
rm hpcx.tbz
# Install NCCL RDMA SHARP plugins # Install NCCL RDMA SHARP plugins
RUN cd /tmp && \ RUN cd /tmp && \
......
...@@ -8,7 +8,7 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3 ...@@ -8,7 +8,7 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3
# - CUDA: 12.4.0 # - CUDA: 12.4.0
# - cuDNN: 9.0.0.306 # - cuDNN: 9.0.0.306
# - cuBLAS: 12.4.2.65 # - cuBLAS: 12.4.2.65
# - NCCL: v2.20 # - NCCL: v2.23.4-1
# - TransformerEngine 1.4 # - TransformerEngine 1.4
# Mellanox: # Mellanox:
# - OFED: 23.07-0.5.1.2 # - OFED: 23.07-0.5.1.2
...@@ -115,6 +115,23 @@ RUN cd /tmp && \ ...@@ -115,6 +115,23 @@ RUN cd /tmp && \
mv amd-blis /opt/AMD && \ mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz rm -rf aocl-blis-linux-aocc-4.0.tar.gz
# Install NCCL 2.23.4
RUN cd /tmp && \
git clone -b v2.23.4-1 https://github.com/NVIDIA/nccl.git && \
cd nccl && \
make -j ${NUM_MAKE_JOBS} src.build && \
make install && \
rm -rf /tmp/nccl
# Install UCX v1.16.0 with multi-threading support (configure-release-mt).
# Cleans up /tmp afterwards, matching the NCCL install step above, so the
# source tree and tarball do not bloat the image layer.
RUN cd /tmp && \
    wget -q https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz && \
    tar xzf ucx-1.16.0.tar.gz && \
    cd ucx-1.16.0 && \
    ./contrib/configure-release-mt --prefix=/usr/local && \
    make -j ${NUM_MAKE_JOBS} && \
    make install && \
    rm -rf /tmp/ucx-1.16.0*
ENV PATH="${PATH}" \ ENV PATH="${PATH}" \
LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \ LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \ SB_HOME=/opt/superbench \
......
...@@ -173,6 +173,11 @@ RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASL ...@@ -173,6 +173,11 @@ RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASL
RUN cd third_party/Megatron/Megatron-DeepSpeed && \ RUN cd third_party/Megatron/Megatron-DeepSpeed && \
git apply ../megatron_deepspeed_rocm6.patch git apply ../megatron_deepspeed_rocm6.patch
# Install AMD SMI Python Library
RUN apt install amd-smi-lib -y && \
cd /opt/rocm/share/amd_smi && \
python3 -m pip install .
ADD . . ADD . .
ENV USE_HIP_DATATYPE=1 ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1 ENV USE_HIPBLAS_COMPUTETYPE=1
......
ARG BASE_IMAGE=rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0
FROM ${BASE_IMAGE}
# OS:
# - Ubuntu: 20.04
# - Docker Client: 20.10.8
# ROCm:
# - ROCm: 6.2
# Lib:
# - torch: 2.3.0
# - rccl: release/rocm-rel-6.2
# - hipblaslt: release-staging/rocm-rel-6.2
# - rocblas: release-staging/rocm-rel-6.2
# - openmpi: 4.1.x
# Intel:
# - mlc: v3.10
LABEL maintainer="SuperBench"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get -q install -y --no-install-recommends \
autoconf \
automake \
bc \
build-essential \
curl \
dmidecode \
git \
hipify-clang \
iproute2 \
jq \
libaio-dev \
libboost-program-options-dev \
libcap2 \
libcurl4-openssl-dev \
libnuma-dev \
libpci-dev \
libssl-dev \
libtinfo5 \
libtool \
lshw \
net-tools \
numactl \
openssh-client \
openssh-server \
pciutils \
python3-mpi4py \
rsync \
sudo \
util-linux \
vim \
wget \
&& \
rm -rf /tmp/*
ARG NUM_MAKE_JOBS=64
# Install CMake >= 3.24.1 from source, but only when the preinstalled cmake
# (if any) is older than the required version.
# Fixes: the then-branch previously ended with `rm -rf ... \` followed
# directly by `else`, so after line-joining the shell saw
# `rm -rf ... else echo ...` and the `if` never received its `else`/`fi`
# keywords — a shell syntax error. A `;` must terminate the branch.
# The version regex also captures an optional patch component so that an
# existing 3.24.0 is not misread as "3.24" and needlessly rebuilt.
RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+(\.\d+)?)" || echo "0.0") && \
    required_version="3.24.1" && \
    if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \
    echo "existing cmake version is ${cmake_version}" && \
    cd /tmp && \
    wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \
    tar xzf cmake-${required_version}.tar.gz && \
    cd cmake-${required_version} && \
    ./bootstrap --prefix=/usr --no-system-curl --parallel=16 && \
    make -j ${NUM_MAKE_JOBS} && \
    make install && \
    rm -rf /tmp/cmake-${required_version}*; \
    else \
    echo "CMake version is greater than or equal to 3.24.1"; \
    fi
# Install Docker
ENV DOCKER_VERSION=20.10.8
RUN cd /tmp && \
wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
rm docker.tgz
# Update system config
RUN mkdir -p /root/.ssh && \
touch /root/.ssh/authorized_keys && \
mkdir -p /var/run/sshd && \
sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \
echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
# Install OFED
ENV OFED_VERSION=5.9-0.5.6.0
# Install OFED only when ofed_info is not already present in the base image.
# NOTE: a `RUN export VAR=...` does not persist across Docker layers, and
# `ENV VAR=${VAR}` cannot read a value produced by a previous RUN (it expands
# to empty), so the Ubuntu release must be resolved inside the same RUN that
# consumes it — otherwise the download URL is malformed.
RUN if ! command -v ofed_info >/dev/null 2>&1; then \
    echo "OFED not found. Installing OFED..."; \
    UBUNTU_VERSION=$(lsb_release -r -s) && \
    cd /tmp && \
    wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
    PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
    rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \
    fi
ENV ROCM_PATH=/opt/rocm
# Install OpenMPI
ENV OPENMPI_VERSION=4.1.x
ENV MPI_HOME=/usr/local/mpi
# Build and install Open MPI v${OPENMPI_VERSION} with ROCm support into
# /usr/local/mpi.
# Fixes: the clone directory is /tmp/ompi, but the old cleanup removed
# /tmp/openmpi-${OPENMPI_VERSION}*, which matches nothing — the full source
# and build tree stayed in the image layer.
RUN cd /tmp && \
    git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
    cd ompi && \
    ./autogen.pl && \
    mkdir build && \
    cd build && \
    ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \
    make -j $(nproc) && \
    make -j $(nproc) install && \
    ldconfig && \
    cd / && \
    rm -rf /tmp/ompi
# Install Intel MLC
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
# Install RCCL
RUN cd /opt/ && \
git clone -b release/rocm-rel-6.2 https://github.com/ROCmSoftwarePlatform/rccl.git && \
cd rccl && \
mkdir build && \
cd build && \
CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \
-DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \
.. && \
make -j${NUM_MAKE_JOBS}
# Install AMD SMI Python Library
RUN apt install amd-smi-lib -y && \
cd /opt/rocm/share/amd_smi && \
python3 -m pip install .
ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
RUN apt install rocm-cmake -y && \
python3 -m pip install --upgrade pip wheel setuptools==65.7
WORKDIR ${SB_HOME}
ADD third_party third_party
# Apply patch
RUN cd third_party/perftest && \
git apply ../perftest_rocm6.patch
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release-staging/rocm-rel-6.2 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.2 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
RUN cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/lib/* /opt/rocm/lib/ && \
cp -r /opt/superbench/third_party/hipBLASLt/build/release/hipblaslt-install/include/* /opt/rocm/include/
RUN cd third_party/Megatron/Megatron-DeepSpeed && \
git apply ../megatron_deepspeed_rocm6.patch
# Install transformer_engine
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git && \
cd TransformerEngine && \
export NVTE_FRAMEWORK=pytorch && \
pip install .
ADD . .
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
RUN python3 -m pip install .[amdworker] && \
CXX=/opt/rocm/bin/hipcc make cppbuild && \
make postinstall
...@@ -61,7 +61,7 @@ You can clone the source from GitHub and build it. ...@@ -61,7 +61,7 @@ You can clone the source from GitHub and build it.
:::note Note :::note Note
You should checkout corresponding tag to use release version, for example, You should checkout corresponding tag to use release version, for example,
`git clone -b v0.10.0 https://github.com/microsoft/superbenchmark` `git clone -b v0.11.0 https://github.com/microsoft/superbenchmark`
::: :::
```bash ```bash
......
...@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password] ...@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password]
:::note Note :::note Note
You should deploy corresponding Docker image to use release version, for example, You should deploy corresponding Docker image to use release version, for example,
`sb deploy -f local.ini -i superbench/superbench:v0.10.0-cuda12.2` `sb deploy -f local.ini -i superbench/superbench:v0.11.0-cuda12.4`
You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone. You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone.
......
...@@ -70,7 +70,7 @@ superbench: ...@@ -70,7 +70,7 @@ superbench:
<TabItem value='example'> <TabItem value='example'>
```yaml ```yaml
version: v0.10 version: v0.11
superbench: superbench:
enable: benchmark_1 enable: benchmark_1
monitor: monitor:
......
...@@ -30,6 +30,9 @@ available tags are listed below for all stable versions. ...@@ -30,6 +30,9 @@ available tags are listed below for all stable versions.
| Tag | Description | | Tag | Description |
|--------------------|-------------------------------------| |--------------------|-------------------------------------|
| v0.11.0-cuda12.4 | SuperBench v0.11.0 with CUDA 12.4 |
| v0.11.0-cuda12.2 | SuperBench v0.11.0 with CUDA 12.2 |
| v0.11.0-cuda11.1.1 | SuperBench v0.11.0 with CUDA 11.1.1 |
| v0.10.0-cuda12.2 | SuperBench v0.10.0 with CUDA 12.2 | | v0.10.0-cuda12.2 | SuperBench v0.10.0 with CUDA 12.2 |
| v0.10.0-cuda11.1.1 | SuperBench v0.10.0 with CUDA 11.1.1 | | v0.10.0-cuda11.1.1 | SuperBench v0.10.0 with CUDA 11.1.1 |
| v0.9.0-cuda12.1 | SuperBench v0.9.0 with CUDA 12.1 | | v0.9.0-cuda12.1 | SuperBench v0.9.0 with CUDA 12.1 |
...@@ -50,6 +53,9 @@ available tags are listed below for all stable versions. ...@@ -50,6 +53,9 @@ available tags are listed below for all stable versions.
| Tag | Description | | Tag | Description |
|-------------------------------|--------------------------------------------------| |-------------------------------|--------------------------------------------------|
| v0.11.0-rocm6.2 | SuperBench v0.11.0 with ROCm 6.2 |
| v0.11.0-rocm6.0 | SuperBench v0.11.0 with ROCm 6.0 |
| v0.10.0-rocm6.0 | SuperBench v0.10.0 with ROCm 6.0 |
| v0.10.0-rocm5.7 | SuperBench v0.10.0 with ROCm 5.7 | | v0.10.0-rocm5.7 | SuperBench v0.10.0 with ROCm 5.7 |
| v0.9.0-rocm5.1.3 | SuperBench v0.9.0 with ROCm 5.1.3 | | v0.9.0-rocm5.1.3 | SuperBench v0.9.0 with ROCm 5.1.3 |
| v0.9.0-rocm5.1.1 | SuperBench v0.9.0 with ROCm 5.1.1 | | v0.9.0-rocm5.1.1 | SuperBench v0.9.0 with ROCm 5.1.1 |
......
...@@ -65,7 +65,7 @@ superbench: ...@@ -65,7 +65,7 @@ superbench:
example: example:
```yaml ```yaml
# SuperBench rules # SuperBench rules
version: v0.10 version: v0.11
superbench: superbench:
rules: rules:
failure-rule: failure-rule:
...@@ -83,8 +83,8 @@ superbench: ...@@ -83,8 +83,8 @@ superbench:
criteria: lambda x:x>0.05 criteria: lambda x:x>0.05
categories: KernelLaunch categories: KernelLaunch
metrics: metrics:
- kernel-launch/event_overhead:\d+ - kernel-launch/event_time:\d+
- kernel-launch/wall_overhead:\d+ - kernel-launch/wall_time:\d+
rule1: rule1:
# Rule 1: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as defective # Rule 1: If H2D_Mem_BW or D2H_Mem_BW test suffers > 5% downgrade, label it as defective
function: variance function: variance
......
...@@ -58,7 +58,7 @@ superbench: ...@@ -58,7 +58,7 @@ superbench:
```yaml title="Example" ```yaml title="Example"
# SuperBench rules # SuperBench rules
version: v0.10 version: v0.11
superbench: superbench:
rules: rules:
kernel_launch: kernel_launch:
...@@ -70,8 +70,8 @@ superbench: ...@@ -70,8 +70,8 @@ superbench:
aggregate: True aggregate: True
categories: KernelLaunch categories: KernelLaunch
metrics: metrics:
- kernel-launch/event_overhead - kernel-launch/event_time
- kernel-launch/wall_overhead - kernel-launch/wall_time
nccl: nccl:
statistics: mean statistics: mean
categories: NCCL categories: NCCL
......
...@@ -168,6 +168,7 @@ def run(self): ...@@ -168,6 +168,7 @@ def run(self):
'openpyxl>=3.0.7', 'openpyxl>=3.0.7',
'packaging>=21.0', 'packaging>=21.0',
'pandas>=1.1.5', 'pandas>=1.1.5',
'protobuf<=3.20.3',
'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4', 'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4',
'pyyaml>=5.3', 'pyyaml>=5.3',
'requests>=2.27.1', 'requests>=2.27.1',
...@@ -218,7 +219,7 @@ def run(self): ...@@ -218,7 +219,7 @@ def run(self):
'onnxruntime-gpu; python_version>="3.10"', 'onnxruntime-gpu; python_version>="3.10"',
], ],
'nvidia': ['py3nvml>=0.2.6'], 'nvidia': ['py3nvml>=0.2.6'],
'amd': ['pyrsmi>=1.0.1'], 'amd': ['amdsmi'],
} }
), ),
include_package_data=True, include_package_data=True,
......
...@@ -6,5 +6,5 @@ ...@@ -6,5 +6,5 @@
Provide hardware and software benchmarks for AI systems. Provide hardware and software benchmarks for AI systems.
""" """
__version__ = '0.10.0' __version__ = '0.11.0'
__author__ = 'Microsoft' __author__ = 'Microsoft'
...@@ -179,11 +179,11 @@ def get_device_temperature(self, idx): ...@@ -179,11 +179,11 @@ def get_device_temperature(self, idx):
Return: Return:
temp (int): the temperature of device, None means failed to get the data. temp (int): the temperature of device, None means failed to get the data.
""" """
temp = None
try: try:
temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU) temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU)
except Exception as err: except Exception as err:
logger.warning('Get device temperature failed: {}'.format(str(err))) logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None
return temp return temp
def get_device_power(self, idx): def get_device_power(self, idx):
...@@ -367,6 +367,7 @@ def get_device_temperature(self, idx): ...@@ -367,6 +367,7 @@ def get_device_temperature(self, idx):
Return: Return:
temp (int): the temperature of device, None means failed to get the data. temp (int): the temperature of device, None means failed to get the data.
""" """
temp = None
try: try:
temp = rocml.amdsmi_get_temp_metric( temp = rocml.amdsmi_get_temp_metric(
self._device_handlers[idx], rocml.AmdSmiTemperatureType.EDGE, rocml.AmdSmiTemperatureMetric.CURRENT self._device_handlers[idx], rocml.AmdSmiTemperatureType.EDGE, rocml.AmdSmiTemperatureMetric.CURRENT
...@@ -375,7 +376,6 @@ def get_device_temperature(self, idx): ...@@ -375,7 +376,6 @@ def get_device_temperature(self, idx):
pass pass
except Exception as err: except Exception as err:
logger.warning('Get device temperature failed: {}'.format(str(err))) logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None
return temp return temp
def get_device_power(self, idx): def get_device_power(self, idx):
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Server: # Server:
# - Product: HPE Apollo 6500 # - Product: HPE Apollo 6500
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
var: var:
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
# - Product: G482-Z53 # - Product: G482-Z53
# - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html # - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
var: var:
......
# SuperBench Config
# Default benchmark configuration for the ROCm 6.2 (8-GPU) image.
# Booleans are written canonically as true/false (yamllint `truthy`);
# YAML 1.1 parsers read yes/no identically, so behavior is unchanged.
version: v0.11
superbench:
  enable: null
  var:
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    common_model_config: &common_model_config
      model_ddp_parameter: &model_ddp_param
        duration: 0
        num_warmup: 128
        num_steps: 512
        sample_count: 8192
        batch_size: 128
        precision: [float32, float16]
        model_action: [train]
        pin_memory: true
        num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        m: 7680
        n: 8192
        k: 8192
    hipblaslt-gemm:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        in_types: ["fp32", "fp16", "bf16", "fp8"]
        tolerant_fail: true
        num_warmup: 100
        num_steps: 1000
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    rccl-bw:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          mca:
            pml: ob1
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
          parallel: false
    ib-loopback:
      enable: true
      modes:
        - name: local
          proc_num: 16
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
          parallel: false
      parameters:
        msg_size: 8388608
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        block_devices: []
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            btl: tcp,self
            pml: ob1
            btl_tcp_if_include: ens17f0
    gpcnet-network-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: ens17f0
    tcp-connectivity:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        port: 22
    dist-inference:
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          mca:
            pml: ob1
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
      frameworks:
        - pytorch
      parameters:
        num_layers: 50
        num_warmup: 20
        num_steps: 100
        use_cuda_graph: true
        precision: float16
        hidden_size: 128
        input_size: 128
        batch_size: 1024
    model-benchmarks:gpt:
      enable: true
      <<: *default_pytorch_mode
      models:
        - gpt2-small
        - gpt2-large
      parameters:
        <<: *model_ddp_param
        precision: [float32, float16, fp8_hybrid]
        batch_size: 32
        seq_len: 224
    model-benchmarks:bert:
      enable: true
      <<: *default_pytorch_mode
      models:
        - bert-base
        - bert-large
      parameters:
        <<: *model_ddp_param
        precision: [float32, float16, fp8_hybrid]
        seq_len: 224
    model-benchmarks:lstm:
      enable: true
      <<: *default_pytorch_mode
      models:
        - lstm
      parameters:
        <<: *model_ddp_param
        batch_size: 1024
        input_size: 224
        hidden_size: 1000
        seq_len: 32
    model-benchmarks:resnet:
      enable: true
      <<: *default_pytorch_mode
      models:
        - resnet50
        - resnet101
        - resnet152
      parameters:
        <<: *model_ddp_param
        batch_size: 384
    model-benchmarks:densenet:
      enable: true
      <<: *default_pytorch_mode
      models:
        - densenet169
        - densenet201
      parameters:
        <<: *model_ddp_param
    model-benchmarks:vgg:
      enable: true
      <<: *default_pytorch_mode
      models:
        - vgg11
        - vgg13
        - vgg16
        - vgg19
      parameters:
        <<: *model_ddp_param
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment