Unverified Commit 949f9cb4 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Release - SuperBench v0.11.0 (#654)



**Description**
Cherry pick bug fixes from v0.11.0 to main

**Major Revision**
* #645 
* #648 
* #646 
* #647 
* #651 
* #652 
* #650

---------
Co-authored-by: default avatarhongtaozhang <hongtaozhang@microsoft.com>
Co-authored-by: default avatarYifan Xiong <yifan.xiong@microsoft.com>
parent 9f3231e9
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Azure NDm A100 v4 # Azure NDm A100 v4
# reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config # SuperBench Config
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
# SuperBench Config
# GPU cluster benchmark configuration (NCCL topology references /opt/microsoft/ndv5-topo.xml).
version: v0.11
superbench:
  # null means "run every benchmark defined below"; set a list of names to run a subset.
  enable: null
  monitor:
    enable: true
    sample_duration: 1
    sample_interval: 10
  var:
    # Shared timeout in seconds, referenced via *default_timeout below.
    # NOTE(review): this anchor was referenced by ib-loopback but never defined,
    # which is a YAML parse error (alias to undefined anchor). 600 matches the
    # explicit timeout used by dist-inference.
    default_timeout: &default_timeout 600
    # One process per GPU on a single node, all processes in parallel.
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
          parallel: true
    # Single-node torch.distributed launch, one rank per GPU.
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Baseline parameters shared by all model-benchmarks entries;
    # individual benchmarks override fields after the merge key.
    common_model_config: &common_model_config
      duration: 0
      num_warmup: 128
      num_steps: 512
      sample_count: 8192
      batch_size: 128
      precision: [float32, float16]
      model_action: [train]
      pin_memory: true
      num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        precision: ["fp64", "fp32", "fp16", "fp64_tc", "tf32_tc", "bf16_tc", "fp16_tc", "int8_tc"]
    cublaslt-gemm:
      <<: *default_local_mode
      parameters:
        in_types: ['fp8e4m3', 'fp8e5m2', 'fp64', 'fp32', 'fp16', 'bf16', 'int8']
        # Each entry is a single m,n,k triple (parsed downstream as one string).
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    gpu-burn:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        time: 900
        doubles: true
        tensor_core: true
    nccl-bw:default:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        ngpus: 8
    # GDR-only variant: P2P and SHM transports disabled so traffic goes over IB.
    nccl-bw:gdr-only:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
          env:
            NCCL_IB_PCI_RELAXED_ORDERING: '1'
            NCCL_NET_GDR_LEVEL: '5'
            NCCL_P2P_DISABLE: '1'
            NCCL_SHM_DISABLE: '1'
            NCCL_MIN_NCHANNELS: '16'
            NCCL_IB_DISABLE: '0'
      parameters:
        ngpus: 8
    nccl-lat:default:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
      parameters:
        maxbytes: 16M
        warmup_iters: 20
        iters: 1000
        graph_iters: 1
    # Two sequential passes covering the even and odd IB devices respectively.
    ib-loopback:
      timeout: *default_timeout
      modes:
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=0,0,1,1
          parallel: true
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=0,0,1,1
          parallel: true
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          # Pin each process to the NUMA node of its GPU (2 GPUs per node).
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
          parallel: false
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        block_devices:
          - /dev/nvme0n1
          - /dev/nvme1n1
          - /dev/nvme2n1
          - /dev/nvme3n1
          - /dev/nvme4n1
          - /dev/nvme5n1
          - /dev/nvme6n1
          - /dev/nvme7n1
        seq_read_runtime: 60
        seq_write_runtime: 60
        seq_readwrite_runtime: 60
        rand_read_runtime: 60
        rand_write_runtime: 60
        rand_readwrite_runtime: 60
    # Small-size data-validation pass; the perf variant below measures bandwidth.
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    cudnn-function:
      <<: *default_local_mode
    cublas-function:
      <<: *default_local_mode
    matmul:
      <<: *default_local_mode
      frameworks:
        - pytorch
    sharding-matmul:
      <<: *default_pytorch_mode
    computation-communication-overlap:
      <<: *default_pytorch_mode
    dist-inference:
      enable: true
      timeout: 600
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          env:
            NCCL_TOPO_FILE: '/opt/microsoft/ndv5-topo.xml'
      frameworks:
        - pytorch
      parameters:
        num_layers: 50
        num_warmup: 20
        num_steps: 100
        use_cuda_graph: true
        precision: float16
        hidden_size: 128
        input_size: 128
        batch_size: 1024
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        msg_size: 8388608
        # $LOCAL_RANK is expanded at runtime by the launcher, not by YAML.
        ib_dev: mlx5_$LOCAL_RANK
        gpu_dev: $LOCAL_RANK
        numa_dev: $((LOCAL_RANK/2))
    gpcnet-network-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
    gpcnet-network-load-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
    tcp-connectivity:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        port: 22
    ort-inference:
      <<: *default_local_mode
    tensorrt-inference:
      <<: *default_local_mode
      parameters:
        pytorch_models:
          - resnet50
          - resnet101
          - resnet152
          - densenet169
          - densenet201
          - bert-base
          - bert-large
        seq_length: 224
        batch_size: 32
        precision: int8
    model-benchmarks:gpt:
      <<: *default_pytorch_mode
      models:
        - gpt2-small
        - gpt2-large
      parameters:
        <<: *common_model_config
        precision: [float32, float16, fp8_hybrid]
        batch_size: 32
        seq_len: 224
    model-benchmarks:bert:
      <<: *default_pytorch_mode
      models:
        - bert-base
        - bert-large
      parameters:
        <<: *common_model_config
        precision: [float32, float16, fp8_hybrid]
        seq_len: 224
    model-benchmarks:lstm:
      <<: *default_pytorch_mode
      models:
        - lstm
      parameters:
        <<: *common_model_config
        batch_size: 1024
        input_size: 224
        hidden_size: 1000
        seq_len: 32
        pin_memory: false
    model-benchmarks:resnet:
      <<: *default_pytorch_mode
      models:
        - resnet50
        - resnet101
        - resnet152
      parameters:
        <<: *common_model_config
        batch_size: 384
        num_steps: 512
    model-benchmarks:densenet:
      <<: *default_pytorch_mode
      models:
        - densenet169
        - densenet201
      parameters:
        <<: *common_model_config
        pin_memory: false
    model-benchmarks:vgg:
      <<: *default_pytorch_mode
      models:
        - vgg11
        - vgg13
        - vgg16
        - vgg19
      parameters:
        <<: *common_model_config
        pin_memory: false
# SuperBench Config # SuperBench Config
version: v0.10 version: v0.11
superbench: superbench:
enable: null enable: null
monitor: monitor:
......
---
slug: release-sb-v0.11
title: Releasing SuperBench v0.11
author: Peng Cheng
author_title: SuperBench Team
author_url: https://github.com/cp5555
author_image_url: https://github.com/cp5555.png
tags: [superbench, announcement, release]
---
We are very happy to announce that **SuperBench 0.11.0 version** is officially released today!
You can install and try superbench by following [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation).
## SuperBench 0.11.0 Release Notes
### SuperBench Improvements
- Add CUDA 12.4 dockerfile.
- Upgrade nccl version to v2.23.4 and install ucx v1.16.0 in cuda 12.4 dockerfile.
- Fix MSCCL build error in CUDA12.4 docker build pipeline.
- Add ROCm6.2 dockerfile.
- Update hpcx link in cuda11.1 dockerfile to fix docker build failure.
- Improve document (Fix metrics name and typos in user tutorial, add BibTeX in README and repo).
- Limit protobuf version to be 3.20.x to fix onnxruntime dependency error.
- Update omegaconf version to 2.3.0 and fix issues caused by omegaconf version update.
- Fix MSCCL build error in CUDA12.4 docker build pipeline.
- Update Docker Exec Command for Persistent HPCX Environment.
- Fix cuda 12.2 dockerfile LD_LIBRARY_PATH issue.
- Use types-setuptools to replace types-pkg_resources.
- Add configuration for NDv5 H100 and AMD MI300x.
### Micro-benchmark Improvements
- Add hipblasLt tuning to dist-inference cpp implementation.
- Add support for NVIDIA L4/L40/L40s GPUs in gemm-flops.
- Upgrade mlc to v3.11.
### Model-benchmark Improvements
- Support FP8 transformer model training in ROCm6.2 dockerfile.
### Result Analysis
- Fix bug of failure test and warning of pandas in data diagnosis.
...@@ -101,7 +101,7 @@ module.exports = { ...@@ -101,7 +101,7 @@ module.exports = {
announcementBar: { announcementBar: {
id: 'supportus', id: 'supportus',
content: content:
'📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.10">v0.10.0</a> has been released! ' + '📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.11">v0.11.0</a> has been released! ' +
'⭐️ If you like SuperBench, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/microsoft/superbenchmark">GitHub</a>! ⭐️', '⭐️ If you like SuperBench, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/microsoft/superbenchmark">GitHub</a>! ⭐️',
}, },
algolia: { algolia: {
......
{ {
"name": "superbench-website", "name": "superbench-website",
"version": "0.10.0", "version": "0.11.0",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {
......
{ {
"name": "superbench-website", "name": "superbench-website",
"version": "0.10.0", "version": "0.11.0",
"private": true, "private": true,
"scripts": { "scripts": {
"docusaurus": "docusaurus", "docusaurus": "docusaurus",
...@@ -38,4 +38,4 @@ ...@@ -38,4 +38,4 @@
"last 1 safari version" "last 1 safari version"
] ]
} }
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment