Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
49a4389b
Commit
49a4389b
authored
Apr 02, 2026
by
one
Browse files
Add bw1000 config files (beta)
parent
53e0e494
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
408 additions
and
0 deletions
+408
-0
superbench/config/hygon_bw1000.yaml
superbench/config/hygon_bw1000.yaml
+363
-0
superbench/config/hygon_bw1000_summary.yaml
superbench/config/hygon_bw1000_summary.yaml
+45
-0
No files found.
superbench/config/hygon_bw1000.yaml
0 → 100644
View file @
49a4389b
# SuperBench Config
version
:
v0.12
superbench
:
enable
:
null
monitor
:
enable
:
false
var
:
default_local_mode
:
&default_local_mode
enable
:
false
modes
:
-
name
:
local
proc_num
:
8
prefix
:
HIP_VISIBLE_DEVICES={proc_rank}
parallel
:
yes
default_pytorch_mode
:
&default_pytorch_mode
enable
:
false
modes
:
-
name
:
torch.distributed
proc_num
:
8
node_num
:
1
frameworks
:
-
pytorch
common_model_config
:
&common_model_config
model_ddp_parameter
:
&model_ddp_param
duration
:
0
num_warmup
:
128
num_steps
:
512
sample_count
:
8192
batch_size
:
128
precision
:
[
float32
,
float16
]
model_action
:
[
train
]
pin_memory
:
yes
num_workers
:
0
benchmarks
:
kernel-launch
:
<<
:
*default_local_mode
gemm-flops
:
<<
:
*default_local_mode
parameters
:
m
:
7680
n
:
8192
k
:
8192
hipblaslt-gemm
:
enable
:
false
modes
:
-
name
:
local
proc_num
:
8
prefix
:
HIP_VISIBLE_DEVICES={proc_rank}
parallel
:
yes
parameters
:
in_types
:
["fp32", "fp16", "bf16"]
tolerant_fail
:
yes
num_warmup
:
100
num_steps
:
1000
shapes
:
-
4096,4096,4096
-
8192,8192,8192
-
16384,16384,16384
gpu-stream
:
enable
:
false
modes
:
-
name
:
local
proc_num
:
8
prefix
:
HIP_VISIBLE_DEVICES={proc_rank}
parallel
:
yes
parameters
:
array_size
:
268435456
num_loops
:
100
precision
:
double
rccl-bw:allreduce-r16:
enable
:
true
modes
:
-
name
:
mpi
proc_num
:
8
node_num
:
2
bind_to
:
none
mca
:
pml
:
ucx
btl
:
^openib
btl_tcp_if_exclude
:
lo,docker0
coll_hcoll_enable
:
0
env
:
ROCM_PATH
:
/opt/dtk
HSA_FORCE_FINE_GRAIN_PCIE
:
1
NCCL_SOCKET_IFNAME
:
p14p2
NCCL_NET_GDR_LEVEL
:
PHB
NCCL_NET_GDR_READ
:
1
parameters
:
maxbytes
:
16G
ngpus
:
1
operation
:
allreduce
rccl-bw:allreduce-r8-pcie:
enable
:
true
modes
:
-
name
:
mpi
proc_num
:
8
node_num
:
1
bind_to
:
none
mca
:
pml
:
ucx
btl
:
^openib
btl_tcp_if_exclude
:
lo,docker0
coll_hcoll_enable
:
0
env
:
ROCM_PATH
:
/opt/dtk
HSA_FORCE_FINE_GRAIN_PCIE
:
1
NCCL_SOCKET_IFNAME
:
p14p2
NCCL_NET_GDR_LEVEL
:
PHB
NCCL_NET_GDR_READ
:
1
NCCL_BUFFSIZE
:
4194304
NCCL_SIMPLE_CHANNELS
:
32
RCCL_P2P_XHCL_CHANNEL_NUM
:
31
RCCL_COLL_XHCL_CHANNEL_NUM
:
28
parameters
:
maxbytes
:
16G
ngpus
:
1
operation
:
allreduce
rccl-bw:allreduce-r8:
enable
:
false
modes
:
-
name
:
mpi
proc_num
:
8
node_num
:
1
bind_to
:
none
mca
:
pml
:
ucx
btl
:
^openib
btl_tcp_if_exclude
:
lo,docker0
coll_hcoll_enable
:
0
env
:
ROCM_PATH
:
/opt/dtk
HSA_FORCE_FINE_GRAIN_PCIE
:
1
NCCL_SOCKET_IFNAME
:
p14p2
NCCL_NET_GDR_LEVEL
:
PHB
NCCL_NET_GDR_READ
:
1
NCCL_BUFFSIZE
:
4194304
parameters
:
maxbytes
:
16G
ngpus
:
1
operation
:
allreduce
rccl-bw:alltoall-r16:
enable
:
true
modes
:
-
name
:
mpi
proc_num
:
8
node_num
:
2
bind_to
:
none
mca
:
pml
:
ucx
btl
:
^openib
btl_tcp_if_exclude
:
lo,docker0
coll_hcoll_enable
:
0
env
:
ROCM_PATH
:
/opt/dtk
HSA_FORCE_FINE_GRAIN_PCIE
:
1
NCCL_SOCKET_IFNAME
:
p14p2
NCCL_NET_GDR_LEVEL
:
PHB
NCCL_NET_GDR_READ
:
1
parameters
:
maxbytes
:
16G
ngpus
:
1
operation
:
alltoall
gpu-hpcg:r32:
enable
:
false
modes
:
-
name
:
mpi
proc_num
:
8
node_num
:
4
bind_to
:
none
mca
:
pml
:
ucx
btl
:
^openib
btl_tcp_if_exclude
:
lo,docker0
coll_hcoll_enable
:
0
env
:
ROCM_PATH
:
/opt/dtk
HSA_FORCE_FINE_GRAIN_PCIE
:
1
parameters
:
nx
:
560
ny
:
280
nz
:
280
rt
:
60
npx
:
4
npy
:
4
npz
:
2
gpu-hpcg:r16:
enable
:
true
modes
:
-
name
:
mpi
proc_num
:
8
node_num
:
2
bind_to
:
none
mca
:
pml
:
ucx
btl
:
^openib
btl_tcp_if_exclude
:
lo,docker0
coll_hcoll_enable
:
0
env
:
ROCM_PATH
:
/opt/dtk
HSA_FORCE_FINE_GRAIN_PCIE
:
1
parameters
:
nx
:
560
ny
:
280
nz
:
280
rt
:
0
npx
:
4
npy
:
2
npz
:
2
gpu-hpcg:r8:
enable
:
false
modes
:
-
name
:
mpi
proc_num
:
8
node_num
:
1
bind_to
:
none
parameters
:
nx
:
560
ny
:
280
nz
:
280
rt
:
60
npx
:
2
npy
:
2
npz
:
2
gpu-hpcg:r4:
enable
:
false
modes
:
-
name
:
mpi
proc_num
:
4
node_num
:
1
bind_to
:
none
parameters
:
nx
:
560
ny
:
280
nz
:
280
rt
:
60
npx
:
2
npy
:
2
npz
:
1
gpu-hpcg:r2:
enable
:
false
modes
:
-
name
:
mpi
proc_num
:
2
node_num
:
1
bind_to
:
none
parameters
:
nx
:
560
ny
:
280
nz
:
280
rt
:
60
npx
:
2
npy
:
1
npz
:
1
gpu-hpcg:r1:
enable
:
false
modes
:
-
name
:
mpi
proc_num
:
1
node_num
:
1
bind_to
:
none
parameters
:
nx
:
560
ny
:
280
nz
:
280
rt
:
60
npx
:
1
npy
:
1
npz
:
1
cpu-memory-bw-latency
:
enable
:
false
modes
:
-
name
:
local
proc_num
:
1
parallel
:
no
parameters
:
tests
:
-
bandwidth_matrix
-
latency_matrix
-
max_bandwidth
mem-bw
:
enable
:
false
modes
:
-
name
:
local
proc_num
:
8
prefix
:
HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
parallel
:
no
ib-loopback
:
enable
:
false
modes
:
-
name
:
local
proc_num
:
16
prefix
:
PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
parallel
:
no
parameters
:
msg_size
:
8388608
disk-benchmark
:
enable
:
false
modes
:
-
name
:
local
proc_num
:
1
parallel
:
no
parameters
:
block_devices
:
[]
gpu-copy-bw:correctness:
enable
:
false
modes
:
-
name
:
local
parallel
:
no
parameters
:
mem_type
:
[
htod
,
dtoh
,
dtod
,
one_to_all
,
all_to_one
,
all_to_all
]
copy_type
:
[
sm
,
dma
]
size
:
4096
num_warm_up
:
0
num_loops
:
1
check_data
:
true
gpu-copy-bw:perf:
enable
:
false
modes
:
-
name
:
local
parallel
:
no
parameters
:
mem_type
:
[
htod
,
dtoh
,
dtod
,
one_to_all
,
all_to_one
,
all_to_all
]
copy_type
:
[
sm
,
dma
]
ib-traffic
:
enable
:
false
modes
:
-
name
:
mpi
proc_num
:
8
parameters
:
command
:
-
ib_write_bw
-
ib_write_lat
direction
:
-
gpu-to-gpu
pattern
:
one-to-one
msg_size
:
8388608
iters
:
5000
timeout
:
120
gpu_dev
:
$LOCAL_RANK
ib_dev
:
'"$(case $LOCAL_RANK in 0) echo mlx5_1 ;; 1) echo mlx5_2 ;; 2) echo mlx5_3 ;; 3) echo mlx5_4 ;; 4) echo mlx5_7 ;; 5) echo mlx5_8 ;; 6) echo mlx5_9 ;; 7) echo mlx5_10 ;; esac)"'
numa_dev
:
'"$(case $LOCAL_RANK in 0) echo 3 ;; 1) echo 1 ;; 2) echo 1 ;; 3) echo 0 ;; 4) echo 7 ;; 5) echo 5 ;; 6) echo 5 ;; 7) echo 4 ;; esac)"'
bidirectional
:
false
# dist-inference:
#   modes:
#     - name: mpi
#       proc_num: 8
#       node_num: 1
#       mca:
#         pml: ob1
#         btl: ^openib
#         btl_tcp_if_exclude: lo,docker0
#         coll_hcoll_enable: 0
#   frameworks:
#     - pytorch
#   parameters:
#     num_layers: 50
#     num_warmup: 20
#     num_steps: 100
#     use_cuda_graph: true
#     precision: float16
#     hidden_size: 128
#     input_size: 128
#     batch_size: 1024
superbench/config/hygon_bw1000_summary.yaml
0 → 100644
View file @
49a4389b
# SuperBench summary rules for hygon_bw1000.yaml
#
# Usage:
#   sb result summary \
#     --data-file <output-dir>/results-summary.jsonl \
#     --rule-file superbench/config/hygon_bw1000_summary.yaml \
#     --output-file-format md \
#     --output-dir <summary-output-dir>
#
# Notes:
#   - This file focuses on the RCCL benchmarks defined in
#     superbench/config/hygon_bw1000.yaml.
#   - Rules for benchmarks that were not run are allowed: if a benchmark was
#     not run, the corresponding category in the summary will be empty.
version
:
v0.12
superbench
:
rules
:
rccl_bw_allreduce_r16_bandwidth
:
statistics
:
mean
categories
:
RCCL rccl-bw:allreduce-r16
metrics
:
-
rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
-
rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
rccl_bw_allreduce_r8_pcie_bandwidth
:
statistics
:
mean
categories
:
RCCL rccl-bw:allreduce-r8-pcie
metrics
:
-
rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
-
rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
rccl_bw_allreduce_r8_bandwidth
:
statistics
:
mean
categories
:
RCCL rccl-bw:allreduce-r8
metrics
:
-
rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
-
rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
rccl_bw_alltoall_r16_bandwidth
:
statistics
:
mean
categories
:
RCCL rccl-bw:alltoall-r16
metrics
:
-
rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
-
rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment