Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
49a4389b
Commit
49a4389b
authored
Apr 02, 2026
by
one
Browse files
Add bw1000 config files (beta)
parent
53e0e494
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
408 additions
and
0 deletions
+408
-0
superbench/config/hygon_bw1000.yaml
superbench/config/hygon_bw1000.yaml
+363
-0
superbench/config/hygon_bw1000_summary.yaml
superbench/config/hygon_bw1000_summary.yaml
+45
-0
No files found.
superbench/config/hygon_bw1000.yaml
0 → 100644
View file @
49a4389b
# SuperBench Config
#
# Benchmark configuration file superbench/config/hygon_bw1000.yaml.
#
# Benchmarks with `enable: true` in this file:
#   - rccl-bw:allreduce-r16       (2 nodes x 8 processes)
#   - rccl-bw:allreduce-r8-pcie   (1 node  x 8 processes)
#   - rccl-bw:alltoall-r16        (2 nodes x 8 processes)
#   - gpu-hpcg:r16                (2 nodes x 8 processes)
# Every other benchmark below is present but has `enable: false`.
version: v0.12
superbench:
  # NOTE(review): null presumably defers to each benchmark's own `enable`
  # flag - confirm against the SuperBench configuration docs.
  enable: null
  monitor:
    enable: false
  var:
    # Shared single-node launch settings; reused below through the
    # *default_local_mode alias (kernel-launch, gemm-flops).
    default_local_mode: &default_local_mode
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: yes
    # Anchor is defined here but not referenced anywhere in this file.
    default_pytorch_mode: &default_pytorch_mode
      enable: false
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Anchor is defined here but not referenced anywhere in this file.
    common_model_config: &common_model_config
      # NOTE(review): &model_ddp_param is attached to an empty value, so the
      # keys that follow belong to common_model_config, not to this anchor.
      # The nesting was reconstructed from a flattened listing - confirm
      # against the committed file.
      model_ddp_parameter: &model_ddp_param
      duration: 0
      num_warmup: 128
      num_steps: 512
      sample_count: 8192
      batch_size: 128
      precision: [float32, float16]
      model_action: [train]
      pin_memory: yes
      num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        m: 7680
        n: 8192
        k: 8192
    # Same enable/modes values as default_local_mode, spelled out inline.
    hipblaslt-gemm:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: yes
      parameters:
        in_types: ["fp32", "fp16", "bf16"]
        tolerant_fail: yes
        num_warmup: 100
        num_steps: 1000
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    # Same enable/modes values as default_local_mode, spelled out inline.
    gpu-stream:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: yes
      parameters:
        array_size: 268435456
        num_loops: 100
        precision: double
    # ---- RCCL bandwidth -----------------------------------------------------
    # All rccl-bw variants share the same mca settings and the same first five
    # env entries; they differ in node_num, extra env tuning and operation.
    # NOTE(review): NCCL_SOCKET_IFNAME (p14p2) and ROCM_PATH (/opt/dtk) are
    # hard-coded - presumably site-specific; verify before reuse elsewhere.
    rccl-bw:allreduce-r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: 1
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    # Single-node allreduce with additional buffer/channel env settings.
    rccl-bw:allreduce-r8-pcie:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: 1
            NCCL_BUFFSIZE: 4194304
            NCCL_SIMPLE_CHANNELS: 32
            RCCL_P2P_XHCL_CHANNEL_NUM: 31
            RCCL_COLL_XHCL_CHANNEL_NUM: 28
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    # Single-node allreduce without the channel-count overrides; disabled.
    rccl-bw:allreduce-r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: 1
            NCCL_BUFFSIZE: 4194304
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:alltoall-r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: 1
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
    # ---- GPU HPCG -----------------------------------------------------------
    # In every variant npx * npy * npz equals proc_num * node_num
    # (r32: 4*4*2, r16: 4*2*2, r8: 2*2*2, r4: 2*2*1, r2: 2*1*1, r1: 1*1*1).
    gpu-hpcg:r32:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 4
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 4
        npy: 4
        npz: 2
    gpu-hpcg:r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
      parameters:
        nx: 560
        ny: 280
        nz: 280
        # NOTE(review): rt is 0 only here; every other gpu-hpcg variant uses
        # 60, and this is the only enabled one - confirm this is intentional.
        rt: 0
        npx: 4
        npy: 2
        npz: 2
    gpu-hpcg:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 2
        npz: 2
    gpu-hpcg:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 2
        npz: 1
    gpu-hpcg:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 1
        npz: 1
    gpu-hpcg:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 1
        npy: 1
        npz: 1
    # ---- Host memory / IB / disk / copy benchmarks (all disabled) -----------
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: false
      modes:
        - name: local
          proc_num: 8
          # Shell integer division: ranks 0-3 bind to NUMA node 0, 4-7 to node 1.
          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
          parallel: no
    ib-loopback:
      enable: false
      modes:
        - name: local
          proc_num: 16
          # Shell integer division: ranks 0-7 use NUMA node 0, 8-15 use node 1.
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
          parallel: no
      parameters:
        msg_size: 8388608
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        block_devices: []
    gpu-copy-bw:correctness:
      enable: false
      modes:
        - name: local
          parallel: no
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: false
      modes:
        - name: local
          parallel: no
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        command:
          - ib_write_bw
          - ib_write_lat
        direction:
          - gpu-to-gpu
        pattern: one-to-one
        msg_size: 8388608
        iters: 5000
        timeout: 120
        gpu_dev: $LOCAL_RANK
        # Shell `case` on $LOCAL_RANK selecting the IB device per rank:
        # 0-3 -> mlx5_1..mlx5_4, 4-7 -> mlx5_7..mlx5_10.
        ib_dev: '"$(case $LOCAL_RANK in 0) echo mlx5_1 ;; 1) echo mlx5_2 ;; 2) echo mlx5_3 ;; 3) echo mlx5_4 ;; 4) echo mlx5_7 ;; 5) echo mlx5_8 ;; 6) echo mlx5_9 ;; 7) echo mlx5_10 ;; esac)"'
        # Shell `case` on $LOCAL_RANK selecting the NUMA node per rank:
        # 0->3, 1->1, 2->1, 3->0, 4->7, 5->5, 6->5, 7->4.
        # NOTE(review): this table references NUMA nodes up to 7, whereas the
        # mem-bw and ib-loopback prefixes bind only to nodes 0 and 1 - confirm
        # both match the actual host topology.
        numa_dev: '"$(case $LOCAL_RANK in 0) echo 3 ;; 1) echo 1 ;; 2) echo 1 ;; 3) echo 0 ;; 4) echo 7 ;; 5) echo 5 ;; 6) echo 5 ;; 7) echo 4 ;; esac)"'
        bidirectional: false
    # dist-inference:
    #   modes:
    #     - name: mpi
    #       proc_num: 8
    #       node_num: 1
    #       mca:
    #         pml: ob1
    #         btl: ^openib
    #         btl_tcp_if_exclude: lo,docker0
    #         coll_hcoll_enable: 0
    #   frameworks:
    #     - pytorch
    #   parameters:
    #     num_layers: 50
    #     num_warmup: 20
    #     num_steps: 100
    #     use_cuda_graph: true
    #     precision: float16
    #     hidden_size: 128
    #     input_size: 128
    #     batch_size: 1024
superbench/config/hygon_bw1000_summary.yaml
0 → 100644
View file @
49a4389b
# SuperBench summary rules for hygon_bw1000.yaml
#
# Usage:
#   sb result summary \
#     --data-file <output-dir>/results-summary.jsonl \
#     --rule-file superbench/config/hygon_bw1000_summary.yaml \
#     --output-file-format md \
#     --output-dir <summary-output-dir>
#
# Notes:
#   - This file focuses on RCCL benchmarks defined in
#     superbench/config/hygon_bw1000.yaml.
#   - Unmatched benchmark sections are allowed. If a benchmark was not run,
#     the corresponding category in the summary will be empty.
#     For example, rccl-bw:allreduce-r8 has `enable: false` in
#     hygon_bw1000.yaml, so its category stays empty unless that benchmark
#     is switched on.
#   - Each rule reports the mean of the bus bandwidth (busbw) and algorithm
#     bandwidth (algbw) metrics at six message sizes, given in bytes:
#       8388608 (8 MiB), 67108864 (64 MiB), 1073741824 (1 GiB),
#       4294967296 (4 GiB), 8589934592 (8 GiB), 17179869184 (16 GiB).
#     The largest size equals the `maxbytes: 16G` setting used by the
#     rccl-bw benchmarks in hygon_bw1000.yaml.

version: v0.12
superbench:
  rules:
    # Two-node allreduce (rccl-bw:allreduce-r16).
    rccl_bw_allreduce_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r16
      metrics:
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw

    # Single-node allreduce, "pcie" variant (rccl-bw:allreduce-r8-pcie).
    rccl_bw_allreduce_r8_pcie_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8-pcie
      metrics:
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw

    # Single-node allreduce (rccl-bw:allreduce-r8). The "/" after the
    # benchmark name keeps these patterns from also matching the
    # rccl-bw:allreduce-r8-pcie metrics.
    rccl_bw_allreduce_r8_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8
      metrics:
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw

    # Two-node alltoall (rccl-bw:alltoall-r16).
    rccl_bw_alltoall_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:alltoall-r16
      metrics:
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment