Commit 49a4389b authored by one's avatar one
Browse files

Add bw1000 config files (beta)

parent 53e0e494
# SuperBench Config
# Benchmark configuration for Hygon BW1000 (8 GPUs/node, RCCL over p14p2).
# Structure reconstructed to the standard SuperBench schema:
#   superbench.var holds reusable mode anchors; superbench.benchmarks lists
#   each benchmark with its run mode(s) and parameters.
version: v0.12
superbench:
  enable: null
  monitor:
    enable: false
  var:
    # Per-GPU local mode: one process per GPU, pinned via HIP_VISIBLE_DEVICES.
    default_local_mode: &default_local_mode
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
    # Single-node torch.distributed mode for model benchmarks.
    default_pytorch_mode: &default_pytorch_mode
      enable: false
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Shared DDP model-benchmark parameters (referenced via *model_ddp_param).
    common_model_config: &common_model_config
      model_ddp_parameter: &model_ddp_param
        duration: 0
        num_warmup: 128
        num_steps: 512
        sample_count: 8192
        batch_size: 128
        precision: [float32, float16]
        model_action: [train]
        pin_memory: true
        num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        m: 7680
        n: 8192
        k: 8192
    hipblaslt-gemm:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        in_types: ['fp32', 'fp16', 'bf16']
        tolerant_fail: true
        num_warmup: 100
        num_steps: 1000
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    gpu-stream:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        array_size: 268435456
        num_loops: 100
        precision: double
    # 2-node (16-rank) allreduce over the NIC (p14p2).
    rccl-bw:allreduce-r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    # Single-node allreduce tuned for PCIe (channel counts via RCCL XHCL knobs).
    rccl-bw:allreduce-r8-pcie:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
            NCCL_BUFFSIZE: '4194304'
            NCCL_SIMPLE_CHANNELS: '32'
            RCCL_P2P_XHCL_CHANNEL_NUM: '31'
            RCCL_COLL_XHCL_CHANNEL_NUM: '28'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
            NCCL_BUFFSIZE: '4194304'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:alltoall-r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
    # HPCG at 4/2/1-node scale; npx*npy*npz must equal total rank count.
    gpu-hpcg:r32:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 4
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 4
        npy: 4
        npz: 2
    gpu-hpcg:r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
      parameters:
        nx: 560
        ny: 280
        nz: 280
        # NOTE(review): rt is 0 here but 60 in every other gpu-hpcg entry —
        # confirm this is intentional and not a typo.
        rt: 0
        npx: 4
        npy: 2
        npz: 2
    gpu-hpcg:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 2
        npz: 2
    gpu-hpcg:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 2
        npz: 1
    gpu-hpcg:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 1
        npz: 1
    gpu-hpcg:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 1
        npy: 1
        npz: 1
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
          parallel: false
    ib-loopback:
      enable: false
      modes:
        - name: local
          proc_num: 16
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
          parallel: false
      parameters:
        msg_size: 8388608
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        # Intentionally empty: fill in target devices (e.g. /dev/nvme0n1)
        # before enabling this benchmark.
        block_devices: []
    gpu-copy-bw:correctness:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        command:
          - ib_write_bw
          - ib_write_lat
        direction:
          - gpu-to-gpu
        pattern: one-to-one
        msg_size: 8388608
        iters: 5000
        timeout: 120
        # Per-rank device selection is resolved at runtime in the shell.
        gpu_dev: $LOCAL_RANK
        ib_dev: '"$(case $LOCAL_RANK in 0) echo mlx5_1 ;; 1) echo mlx5_2 ;; 2) echo mlx5_3 ;; 3) echo mlx5_4 ;; 4) echo mlx5_7 ;; 5) echo mlx5_8 ;; 6) echo mlx5_9 ;; 7) echo mlx5_10 ;; esac)"'
        numa_dev: '"$(case $LOCAL_RANK in 0) echo 3 ;; 1) echo 1 ;; 2) echo 1 ;; 3) echo 0 ;; 4) echo 7 ;; 5) echo 5 ;; 6) echo 5 ;; 7) echo 4 ;; esac)"'
        bidirectional: false
    # dist-inference:
    #   modes:
    #     - name: mpi
    #       proc_num: 8
    #       node_num: 1
    #       mca:
    #         pml: ob1
    #         btl: ^openib
    #         btl_tcp_if_exclude: lo,docker0
    #         coll_hcoll_enable: 0
    #   frameworks:
    #     - pytorch
    #   parameters:
    #     num_layers: 50
    #     num_warmup: 20
    #     num_steps: 100
    #     use_cuda_graph: true
    #     precision: float16
    #     hidden_size: 128
    #     input_size: 128
    #     batch_size: 1024
---
# SuperBench summary rules for hygon_bw1000.yaml
#
# Usage:
#   sb result summary \
#     --data-file <output-dir>/results-summary.jsonl \
#     --rule-file superbench/config/hygon_bw1000_summary.yaml \
#     --output-file-format md \
#     --output-dir <summary-output-dir>
#
# Notes:
#   - This file focuses on RCCL benchmarks defined in
#     superbench/config/hygon_bw1000.yaml.
#   - Unmatched benchmark sections are allowed. If a benchmark was not run,
#     the corresponding category in the summary will be empty.
#   - Each metrics entry is a regex matched against result metric names; the
#     alternation lists the message sizes (bytes) to aggregate.
version: v0.12
superbench:
  rules:
    rccl_bw_allreduce_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r16
      metrics:
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r8_pcie_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8-pcie
      metrics:
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r8_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8
      metrics:
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_alltoall_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:alltoall-r16
      metrics:
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment