Unverified Commit dfbd70b1 authored by Yifan Xiong's avatar Yifan Xiong Committed by GitHub
Browse files

Release - SuperBench v0.3.0 (#212)



**Description**

Cherry-pick bug fixes from v0.3.0 to main.

**Major Revisions**
* Docs - Upgrade version and release note (#209)
* Benchmarks: Build Pipeline - Update rccl-test git submodule to dc1ad48 (#210)
* Benchmarks: Update - Update benchmarks in configuration file (#208)
* CI/CD - Update GitHub Action VM (#211)
* Benchmarks: Fix Bug - Fix wrong parameters for gpu-sm-copy-bw in configuration examples (#203)
* CI/CD - Fix bug in build image for push event (#205)
* Benchmark: Fix Bug - fix error message of communication-computation-overlap (#204)
* Tool: Fix bug - Fix function naming issue in system info (#200)
* CI/CD - Push images in GitHub Action (#202)
* Bug - Fix torch.distributed command for single node (#201)
* CLI - Integrate system info for node (#199)
* Benchmarks: Code Revision - Revise CMake files for microbenchmarks. (#196)
* CI/CD - Add ROCm image build in GitHub Actions (#194)
* Bug: Fix bug - fix bug of hipBusBandwidth build (#193)
* Benchmarks: Build Pipeline - Restore rocblas build logic (#197)
* Bug: Fix Bug - Add barrier before 'destroy_process_group' in model benchmarks (#198)
* Bug - Revise 'docker run' in sb deploy (#195)
* Bug - Fix Bug : fix bug of error param operations to operation in rccl-bw of hpe config (#190)
Co-authored-by: default avatarYuting Jiang <v-yujiang@microsoft.com>
Co-authored-by: default avatarGuoshuai Zhao <guzhao@microsoft.com>
Co-authored-by: default avatarZiyue Yang <ziyyang@microsoft.com>
parent 37b15db9
......@@ -3,7 +3,7 @@
# Server:
# - Product: HPE Apollo 6500
version: v0.2
version: v0.3
superbench:
enable: null
var:
......@@ -40,24 +40,20 @@ superbench:
rccl-bw:
enable: true
modes:
- name: mpi
proc_num: 8
env:
NCCL_SOCKET_IFNAME: ens17f0
NCCL_IB_GDR_LEVEL: 1
- name: local
proc_num: 1
parallel: no
parameters:
maxbytes: 128M
minbytes: 32M
iters: 50
ngpus: 1
operations: allreduce
maxbytes: 8G
ngpus: 8
operation: allreduce
mem-bw:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
parameters:
m: 7680
n: 8192
m: 7680
n: 8192
k: 8192
ib-loopback:
enable: true
......@@ -75,15 +71,16 @@ superbench:
parameters:
block_devices: []
gpu-sm-copy-bw:
enable: false
enable: true
modes:
- name: local
proc_num: 32
prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
prefix: HIP_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
dtoh: true
htod: true
mem_type:
- dtoh
- htod
gpt_models:
<<: *default_pytorch_mode
models:
......
......@@ -4,7 +4,7 @@
# - Product: G482-Z53
# - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html
version: v0.2
version: v0.3
superbench:
enable: null
var:
......@@ -13,7 +13,7 @@ superbench:
modes:
- name: local
proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank}
prefix: HIP_VISIBLE_DEVICES={proc_rank}
parallel: yes
default_pytorch_mode: &default_pytorch_mode
enable: true
......@@ -36,6 +36,52 @@ superbench:
- train
pin_memory: yes
benchmarks:
kernel-launch:
<<: *default_local_mode
rccl-bw:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
parameters:
maxbytes: 8G
ngpus: 8
operation: allreduce
mem-bw:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
parameters:
m: 7680
n: 8192
k: 8192
ib-loopback:
enable: true
modes:
- name: local
proc_num: 2
prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1
parallel: no
disk-benchmark:
enable: false
modes:
- name: local
proc_num: 1
parallel: no
parameters:
block_devices: []
gpu-sm-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: HIP_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
gpt_models:
<<: *default_pytorch_mode
models:
......
# SuperBench Config
version: v0.2
version: v0.3
superbench:
enable: null
var:
......@@ -35,6 +35,51 @@ superbench:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
nccl-bw:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
parameters:
ngpus: 8
ib-loopback:
enable: true
modes:
- name: local
proc_num: 4
prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=1,0,3,2
parallel: yes
- name: local
proc_num: 4
prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
parallel: yes
mem-bw:
enable: true
modes:
- name: local
proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
parallel: yes
disk-benchmark:
enable: false
modes:
- name: local
proc_num: 1
parallel: no
parameters:
block_devices: []
gpu-sm-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
cudnn-function:
<<: *default_local_mode
cublas-function:
......
# SuperBench Config
version: v0.2
version: v0.3
superbench:
enable: null
var:
......@@ -32,7 +32,10 @@ superbench:
enable: true
modes:
- name: local
prefix: NCCL_DEBUG=INFO NCCL_IB_DISABLE=1
proc_num: 1
parallel: no
parameters:
ngpus: 8
ib-loopback:
enable: true
modes:
......@@ -61,15 +64,16 @@ superbench:
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
parallel: yes
gpu-sm-copy-bw:
enable: false
enable: true
modes:
- name: local
proc_num: 32
prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
dtoh: true
htod: true
mem_type:
- dtoh
- htod
kernel-launch:
<<: *default_local_mode
gemm-flops:
......
......@@ -101,7 +101,7 @@
{{ '--security-opt seccomp=unconfined --group-add video' if amd_gpu_exist else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \
-v /var/run/docker.sock:/var/run/docker.sock \
{{ docker_image }} bash && \
--entrypoint /bin/bash {{ docker_image }} && \
docker exec {{ container }} bash -c \
"chown -R root:root ~ && \
sed -i 's/[# ]*Port.*/Port {{ ssh_port }}/g' /etc/ssh/sshd_config && \
......
......@@ -123,20 +123,13 @@ def __get_mode_command(self, benchmark_name, mode):
elif mode.name == 'torch.distributed':
# TODO: replace with torch.distributed.run in v1.9
# TODO: only supports node_num=1 and node_num=all currently
torch_dist_params = '' if mode.node_num == 1 else \
'--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
mode_command = (
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node={proc_num} '
'--nnodes={node_num} --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'{command} {torch_distributed_suffix}'
).format(
proc_num=mode.proc_num,
node_num=1 if mode.node_num == 1 else '$NNODES',
command=exec_command,
torch_distributed_suffix=(
'superbench.benchmarks.{name}.parameters.distributed_impl=ddp '
'superbench.benchmarks.{name}.parameters.distributed_backend=nccl'
).format(name=benchmark_name),
f'python3 -m torch.distributed.launch'
f' --use_env --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl'
)
elif mode.name == 'mpi':
mode_command = (
......
This diff is collapsed.
......@@ -81,3 +81,7 @@ def test_sb_run_nonexist_host_file(self):
"""Test sb run, --host-file does not exist, should fail."""
result = self.cmd('sb run --host-file ./nonexist.yaml', expect_failure=True)
self.assertEqual(result.exit_code, 1)
def test_sb_node_info(self):
    """Test sb node info, should succeed."""
    self.cmd('sb node info', expect_failure=False)
......@@ -116,8 +116,6 @@ def test_get_mode_command(self):
'expected_command': (
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=8 '
'--nnodes=1 --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
'superbench.benchmarks.foo.parameters.distributed_backend=nccl'
......
......@@ -8,7 +8,6 @@ MPI_HOME ?= /usr/local/mpi
HIP_HOME ?= /opt/rocm/hip
RCCL_HOME ?= /opt/rocm/rccl
ROCM_VERSION ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
ROCM_ARCH ?= $(shell rocminfo | grep " gfx" | uniq | awk '{print $$2}')
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest
......@@ -66,7 +65,7 @@ ifneq (,$(wildcard fio/Makefile))
cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) && make -j && make install
endif
# Build rccl-tests from commit cc34c5 of develop branch (default branch).
# Build rccl-tests from commit dc1ad48 of develop branch (default branch).
rocm_rccl_tests: sb_micro_path
ifneq (, $(wildcard rccl-tests/Makefile))
cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) HIP_HOME=$(HIP_HOME) RCCL_HOME=$(RCCL_HOME) -j
......@@ -81,21 +80,14 @@ rocm_rocblas: sb_micro_path
ifeq (, $(wildcard $(SB_MICRO_PATH)/bin/rocblas-bench))
if [ -d rocBLAS ]; then rm -rf rocBLAS; fi
git clone -b ${ROCM_VERSION} https://github.com/ROCmSoftwarePlatform/rocBLAS.git ./rocBLAS
ifeq (${ROCM_VERSION}, rocm-4.0.0)
sed -i '/CMAKE_MATCH_1/a\ get_filename_component(HIP_CLANG_ROOT "$${HIP_CLANG_ROOT}" DIRECTORY)' /opt/rocm/hip/lib/cmake/hip/hip-config.cmake
cd ./rocBLAS && HIPCC_COMPILE_FLAGS_APPEND="-D_OPENMP=201811 -O3 -Wno-format-nonliteral -DCMAKE_HAVE_LIBC_PTHREAD -parallel-jobs=2" HIPCC_LINK_FLAGS_APPEND="-lpthread -O3 -parallel-jobs=2" ./install.sh -idc -a ${ROCM_ARCH}
else
cd ./rocBLAS && ./install.sh -idc
endif
cd ./rocBLAS && ./install.sh --dependencies --clients-only
cp -v ./rocBLAS/build/release/clients/staging/rocblas-bench $(SB_MICRO_PATH)/bin/
endif
# Build hipBusBandwidth.
# HIP is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.
rocm_bandwidthTest:
cp -r -v $(shell hipconfig -p) ./
ifneq (, $(wildcard hip/samples/1_Utils/hipBusBandwidth/CMakeLists.txt))
cd ./hip/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
cp -v ./hip/samples/1_Utils/hipBusBandwidth/build/hipBusBandwidth $(SB_MICRO_PATH)/bin/
endif
rocm_bandwidthTest: sb_micro_path
cp -r -v $(shell hipconfig -p)/samples/1_Utils/hipBusBandwidth ./
cd ./hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
cp -v ./hipBusBandwidth/build/hipBusBandwidth $(SB_MICRO_PATH)/bin/
Subproject commit cc34c545098145bc148e5035e4c8e767b4d71ece
Subproject commit dc1ad4853d7ec738387d42a75a58a98d7af00c7b
---
slug: release-sb-v0.3
title: Releasing SuperBench v0.3
author: Peng Cheng
author_title: SuperBench Team
author_url: https://github.com/cp5555
author_image_url: https://github.com/cp5555.png
tags: [superbench, announcement, release]
---
We are very happy to announce that **SuperBench 0.3.0 version** is officially released today!
You can install and try superbench by following [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation).
## SuperBench 0.3.0 Release Notes
### SuperBench Framework
#### Runner
- Implement MPI mode.
#### Benchmarks
- Support Docker benchmark.
### Single-node Validation
#### Micro Benchmarks
1. Memory (Tool: NVIDIA/AMD Bandwidth Test Tool)
| Metrics | Unit | Description |
|----------------|------|-------------------------------------|
| H2D_Mem_BW_GPU | GB/s | host-to-GPU bandwidth for each GPU |
| D2H_Mem_BW_GPU | GB/s | GPU-to-host bandwidth for each GPU |
2. IBLoopback (Tool: PerfTest – Standard RDMA Test Tool)
| Metrics | Unit | Description |
|----------|------|---------------------------------------------------------------|
| IB_Write | MB/s | The IB write loopback throughput with different message sizes |
| IB_Read | MB/s | The IB read loopback throughput with different message sizes |
| IB_Send | MB/s | The IB send loopback throughput with different message sizes |
3. NCCL/RCCL (Tool: NCCL/RCCL Tests)
| Metrics | Unit | Description |
|---------------------|------|-----------------------------------------------------------------|
| NCCL_AllReduce | GB/s | The NCCL AllReduce performance with different message sizes |
| NCCL_AllGather | GB/s | The NCCL AllGather performance with different message sizes |
| NCCL_broadcast | GB/s | The NCCL Broadcast performance with different message sizes |
| NCCL_reduce | GB/s | The NCCL Reduce performance with different message sizes |
| NCCL_reduce_scatter | GB/s | The NCCL ReduceScatter performance with different message sizes |
4. Disk (Tool: FIO – Standard Disk Performance Tool)
| Metrics | Unit | Description |
|----------------|------|---------------------------------------------------------------------------------|
| Seq_Read | MB/s | Sequential read performance |
| Seq_Write | MB/s | Sequential write performance |
| Rand_Read | MB/s | Random read performance |
| Rand_Write | MB/s | Random write performance |
| Seq_R/W_Read | MB/s | Read performance in sequential read/write, fixed measurement (read:write = 4:1) |
| Seq_R/W_Write | MB/s | Write performance in sequential read/write (read:write = 4:1) |
| Rand_R/W_Read | MB/s | Read performance in random read/write (read:write = 4:1) |
| Rand_R/W_Write | MB/s | Write performance in random read/write (read:write = 4:1) |
5. H2D/D2H SM Transmission Bandwidth (Tool: MSR-A build)
| Metrics | Unit | Description |
|---------------|------|-----------------------------------------------------|
| H2D_SM_BW_GPU | GB/s | host-to-GPU bandwidth using GPU kernel for each GPU |
| D2H_SM_BW_GPU | GB/s | GPU-to-host bandwidth using GPU kernel for each GPU |
### AMD GPU Support
#### Docker Image Support
- ROCm 4.2 PyTorch 1.7.0
- ROCm 4.0 PyTorch 1.7.0
#### Micro Benchmarks
1. Kernel Launch (Tool: MSR-A build)
| Metrics | Unit | Description |
|--------------------------|-----------|--------------------------------------------------------------|
| Kernel_Launch_Event_Time | Time (ms) | Dispatch latency measured in GPU time using hipEventRecord() |
| Kernel_Launch_Wall_Time | Time (ms) | Dispatch latency measured in CPU time |
2. GEMM FLOPS (Tool: AMD rocblas-bench Tool)
| Metrics | Unit | Description |
|----------|--------|-------------------------------|
| FP64 | GFLOPS | FP64 FLOPS without MatrixCore |
| FP32(MC) | GFLOPS | TF32 FLOPS with MatrixCore |
| FP16(MC) | GFLOPS | FP16 FLOPS with MatrixCore |
| BF16(MC) | GFLOPS | BF16 FLOPS with MatrixCore |
| INT8(MC) | GOPS   | INT8 OPS with MatrixCore      |
#### E2E Benchmarks
1. CNN models -- Use PyTorch torchvision models
- ResNet: ResNet-50, ResNet-101, ResNet-152
- DenseNet: DenseNet-169, DenseNet-201
  - VGG: VGG-11, VGG-13, VGG-16, VGG-19
2. BERT -- Use huggingface Transformers
- BERT
- BERT Large
3. LSTM -- Use PyTorch
4. GPT-2 -- Use huggingface Transformers
### Bug Fix
- VGG models failed on A100 GPU with batch_size=128
### Other Improvement
1. Contribution related
- Contribute rule
- System information collection
2. Document
- Add release process doc
- Add design documents
- Add developer guide doc for coding style
- Add contribution rules
- Add docker image list
- Add initial validation results
......@@ -101,7 +101,7 @@ module.exports = {
announcementBar: {
id: 'supportus',
content:
'📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.2">v0.2.1</a> has been released! ' +
'📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.3">v0.3.0</a> has been released! ' +
'⭐️ If you like SuperBench, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/microsoft/superbenchmark">GitHub</a>! ⭐️',
},
algolia: {
......
{
"name": "superbench-website",
"version": "0.2.1",
"version": "0.3.0",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
......
{
"name": "superbench-website",
"version": "0.2.1",
"version": "0.3.0",
"private": true,
"scripts": {
"docusaurus": "docusaurus",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment