Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
hg-misc-tools
Commits
0286389b
Unverified
Commit
0286389b
authored
Mar 16, 2026
by
one
Committed by
GitHub
Mar 16, 2026
Browse files
[onebenchmark] Add ansible files for rccl tests (#4)
parent
9e574a55
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
405 additions
and
0 deletions
+405
-0
projects/onebenchmark/main.yml
projects/onebenchmark/main.yml
+2
-0
projects/onebenchmark/rccl-tests/mpirun_rccltest
projects/onebenchmark/rccl-tests/mpirun_rccltest
+134
-0
projects/onebenchmark/rccl-tests/run-allreduce.sh
projects/onebenchmark/rccl-tests/run-allreduce.sh
+41
-0
projects/onebenchmark/rccl-tests/run-alltoall.sh
projects/onebenchmark/rccl-tests/run-alltoall.sh
+38
-0
projects/onebenchmark/rccl-tests/run-sendrecv.sh
projects/onebenchmark/rccl-tests/run-sendrecv.sh
+27
-0
projects/onebenchmark/rccl-tests/topo-mapping-bw1000.xml
projects/onebenchmark/rccl-tests/topo-mapping-bw1000.xml
+36
-0
projects/onebenchmark/run-tests.yml
projects/onebenchmark/run-tests.yml
+43
-0
projects/onebenchmark/start-cluster.yml
projects/onebenchmark/start-cluster.yml
+77
-0
projects/onebenchmark/vars.yml
projects/onebenchmark/vars.yml
+7
-0
No files found.
projects/onebenchmark/main.yml
0 → 100644
View file @
0286389b
# Entry point playbook: first bring up the benchmark container on every host
# and install rccl-tests, then stage and run the selected test script.
- import_playbook: start-cluster.yml
- import_playbook: run-tests.yml
projects/onebenchmark/rccl-tests/mpirun_rccltest
0 → 100644
View file @
0286389b
#!/bin/bash
# Wrapper that launches an rccl-tests command through mpirun, either on the
# local node only or across the nodes given with -H/--hosts.
# Abort on the first unhandled command failure.
set -e
# =================================================
# Helper functions
# =================================================

#######################################
# Print the usage text of this wrapper.
# Globals:   tcp_iface, ssh_port (read; shown as the advertised defaults)
# Arguments: none
# Outputs:   usage text on stdout
#######################################
help() {
  cat <<EOF
RCCL Tests MPI run helper script

Usage:
    $(basename "$0") [OPTIONS] [--] TEST_BINARY [TEST_ARGS...]

OPTIONS:
    -h, --help      Show this help message and exit
    -np             Total number of processes (default: sum of per-node counts in --hosts,
                    or the auto-detected GPU count when --hosts is not given)
    -H, --hosts     Comma-separated list of nodes with optional process count per node
                    Format: node01:8,node02:8
                    If count is omitted, falls back to auto-detected GPU count per node.
    --tcp-iface     TCP interface to use for communication (default: ${tcp_iface})
    --ssh-port      SSH port to use for remote connections (default: ${ssh_port})

Any word that is not one of the options above, and everything after "--",
is passed to mpirun as the test command line.
EOF
}
# =================================================
# Global variables
# =================================================
np=                 # total process count; when left empty it is derived later
hosts_raw=          # raw -H/--hosts value; empty selects single-node mode
tcp_iface=p14p2
ssh_port=3333
rccltest_args=()    # test command line forwarded to mpirun
mpi_bin=/opt/mpi/bin/mpirun
ompi_prefix=/opt/mpi

# Detect the number of GPUs per node (used as fallback when count is not specified in --hosts)
#
# `grep -c` prints the count but exits with status 1 when that count is zero
# (hy-smi missing, failing, or reporting no devices). Under `set -e` that
# status would abort the script right at this assignment, so the fallback
# below could never run. `|| true` keeps the printed count and neutralises
# the exit status.
ngpu_per_node=$(hy-smi --showid 2>/dev/null | grep -ic "Device ID" || true)
if [[ -z "${ngpu_per_node}" || "${ngpu_per_node}" -eq 0 ]]; then
  echo "[WRAPPER] Failed to get the number of GPUs per node via hy-smi. Defaulting to 8."
  ngpu_per_node=8
else
  echo "[WRAPPER] Detected ${ngpu_per_node} GPUs per node."
fi
# =================================================
# Parameter parsing
# =================================================

#######################################
# Abort with a readable message when an option that takes a value is the
# last word on the command line. Without this check the following `shift 2`
# fails and `set -e` ends the script without any diagnostic.
# Arguments: $1 - option name as typed
#            $2 - number of remaining words, the option itself included
# Outputs:   error message on stderr when the value is missing
#######################################
require_value() {
  if (( $2 < 2 )); then
    echo "[WRAPPER] Option ${1} requires a value." >&2
    exit 1
  fi
}

# Recognised options are consumed here; every other word is collected in
# rccltest_args and later handed to mpirun as the test command line.
while [[ $# -gt 0 ]]; do
  case "${1}" in
    -h | --help)
      help
      exit 0
      ;;
    -np)
      require_value "${1}" "$#"
      np=${2}
      shift 2
      ;;
    -H | --hosts)
      require_value "${1}" "$#"
      hosts_raw=${2}
      shift 2
      ;;
    --tcp-iface)
      require_value "${1}" "$#"
      tcp_iface=${2}
      shift 2
      ;;
    --ssh-port)
      require_value "${1}" "$#"
      ssh_port=${2}
      shift 2
      ;;
    --)
      # Everything after "--" belongs to the test command, verbatim.
      shift
      rccltest_args+=("$@")
      break
      ;;
    *)
      rccltest_args+=("${1}")
      shift
      ;;
  esac
done
# =================================================
# Parse hosts into parallel arrays: node_names[], node_slots[]
# Input format: node01:8,node02:8 (count optional, falls back to ngpu_per_node)
# =================================================

#######################################
# Split hosts_raw into node names and per-node process counts.
# Globals:   hosts_raw, ngpu_per_node (read)
#            node_names, node_slots (written; index i of both describes node i)
# Arguments: none
# Outputs:   error message on stderr for a malformed entry
# Returns:   0 on success, 1 when an entry has no node name or a count that
#            is not a positive integer
#######################################
parse_hosts() {
  local -a entries=()
  local entry name slots

  node_names=()
  node_slots=()

  IFS=',' read -ra entries <<< "${hosts_raw}"
  for entry in "${entries[@]}"; do
    # Tolerate stray commas such as "node01,,node02".
    [[ -n "${entry}" ]] || continue

    name="${entry%%:*}"
    if [[ "${entry}" == *:* ]]; then
      slots="${entry##*:}"
    else
      slots=""
    fi
    # Both "node01" and "node01:" fall back to the detected GPU count.
    if [[ -z "${slots}" ]]; then
      slots="${ngpu_per_node}"
    fi

    if [[ -z "${name}" ]]; then
      echo "[WRAPPER] Invalid --hosts entry '${entry}': missing node name." >&2
      return 1
    fi
    if [[ ! "${slots}" =~ ^[1-9][0-9]*$ ]]; then
      echo "[WRAPPER] Invalid --hosts entry '${entry}': process count must be a positive integer." >&2
      return 1
    fi

    node_names+=("${name}")
    node_slots+=("${slots}")
  done
}
# =================================================
# Run rccl test script
# =================================================

#######################################
# Build the "-x NAME" mpirun arguments for every environment variable whose
# `env` output line matches the given pattern.
# Globals:   env_forward_args (written)
# Arguments: $1 - extended regular expression applied to `env` output lines
#######################################
collect_env_forward_args() {
  local pattern=$1
  local var_name

  env_forward_args=()
  # Process substitution keeps the loop in the current shell, and a grep
  # that matches nothing simply yields an empty array.
  while IFS= read -r var_name; do
    env_forward_args+=(-x "${var_name}")
  done < <(env | grep -E "${pattern}" | cut -d= -f1)
}

if [[ -z "${hosts_raw}" ]]; then
  # Run single-node test if --hosts is not set
  echo "[WRAPPER] No compute nodes specified. Running in single-node mode."

  # Default np to ngpu_per_node when not set
  np="${np:-${ngpu_per_node}}"
  echo "Using np=${np}"

  collect_env_forward_args '^(NCCL|RCCL|UCX|HSA)_'

  "${mpi_bin}" --allow-run-as-root \
    --bind-to none \
    --mca pml ucx \
    --mca osc ucx \
    --mca btl ^vader,tcp,openib,uct \
    --mca coll ^hcoll \
    "${env_forward_args[@]}" \
    -np "${np}" \
    "${rccltest_args[@]}"
else
  # Multi-node mode
  echo "[WRAPPER] Running in multi-node mode."

  parse_hosts

  if [[ "${#node_names[@]}" -eq 0 ]]; then
    echo "[WRAPPER] --hosts '${hosts_raw}' does not name any node." >&2
    exit 1
  fi

  # Build MPI -H string and auto-sum np
  hosts_string=""
  np_sum=0
  for i in "${!node_names[@]}"; do
    # Check the count before doing arithmetic on it: an empty or non-numeric
    # value would otherwise surface as an arithmetic syntax error.
    if [[ ! "${node_slots[$i]}" =~ ^[0-9]+$ ]]; then
      echo "[WRAPPER] Invalid process count '${node_slots[$i]}' for node '${node_names[$i]}'." >&2
      exit 1
    fi
    hosts_string+="${node_names[$i]}:${node_slots[$i]},"
    # Use $(( )) rather than (( )): the latter returns status 1 whenever the
    # result is 0, which `set -e` treats as a failure. 10# forces base 10 so
    # a count written as "08" is not read as octal.
    np_sum=$(( np_sum + 10#${node_slots[$i]} ))
  done
  hosts_string="${hosts_string%,}"

  # -np overrides auto-sum if explicitly provided
  np="${np:-${np_sum}}"

  echo "[WRAPPER] MPI hosts: ${hosts_string}"
  echo "[WRAPPER] Total processes (np): ${np}"
  echo "[WRAPPER] Using TCP interface: ${tcp_iface}"
  echo "[WRAPPER] Using SSH port: ${ssh_port}"

  collect_env_forward_args '^(NCCL|RCCL|UCX|HSA|HIP)_'

  "${mpi_bin}" --allow-run-as-root \
    --prefix "${ompi_prefix}" \
    --bind-to none \
    --mca pml ucx \
    --mca btl_tcp_if_include "${tcp_iface}" \
    --mca plm_rsh_args "-p ${ssh_port}" \
    "${env_forward_args[@]}" \
    -x ROCM_PATH -x PATH -x LD_LIBRARY_PATH \
    -np "${np}" \
    -H "${hosts_string}" \
    "${rccltest_args[@]}"
fi
projects/onebenchmark/rccl-tests/run-allreduce.sh
0 → 100644
View file @
0286389b
#!/bin/bash
# all_reduce_perf sweep: 2, 4 and 8 ranks on the local node, then 16 ranks on
# node01,node02 and 32 ranks on node01..node04.
# Optional environment: SSH_PORT - forwarded as --ssh-port to mpirun_rccltest
# for the multi-node runs.
set -e

unset UCX_HOME
# export UCX_LOG_LEVEL=fatal

export NCCL_TOPO_DUMP_FILE=${PWD}/topo-generated.xml
export NCCL_GRAPH_DUMP_FILE=${PWD}/graph-generated.xml
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL

#export RCCL_SDMA_COPY_ENABLE=1
#export RCCL_SDMA_LINK_MODE=0

# PCIe mixed link
# export NCCL_SIMPLE_CHANNELS=32
# export RCCL_P2P_XHCL_CHANNEL_NUM=31
# export RCCL_COLL_XHCL_CHANNEL_NUM=28

export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_SOCKET_IFNAME=p14p2
export NCCL_IB_HCA="=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10"
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=1
# export NCCL_ALGO=Ring
# export NCCL_PROTO=Simple
export NCCL_SIMPLE_CHANNELS=32
unset NCCL_NCHANNELS_PER_PEER

export NCCL_TOPO_MAPPING_FILE=${PWD}/topo-mapping-bw1000.xml
# export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml # use either topo or topo mapping, not both
# export NCCL_GRAPH_FILE=${PWD}/graph-16r-allreduce.xml

# Pass --ssh-port only when SSH_PORT is actually set. An unquoted, empty
# ${SSH_PORT} would make "--ssh-port" swallow the test binary name as its
# value; without the option mpirun_rccltest applies its own default port.
ssh_port_args=()
if [[ -n "${SSH_PORT:-}" ]]; then
  ssh_port_args=(--ssh-port "${SSH_PORT}")
fi

# Test command shared by every run below.
perf_cmd=(all_reduce_perf -b 4 -e 16G -f 2 -w 3 -g 1)

mpirun_rccltest -np 2 "${perf_cmd[@]}"
mpirun_rccltest -np 4 "${perf_cmd[@]}"
mpirun_rccltest -np 8 "${perf_cmd[@]}"
mpirun_rccltest -np 16 -H node01,node02 "${ssh_port_args[@]}" "${perf_cmd[@]}"
mpirun_rccltest -np 32 -H node01,node02,node03,node04 "${ssh_port_args[@]}" "${perf_cmd[@]}"
projects/onebenchmark/rccl-tests/run-alltoall.sh
0 → 100644
View file @
0286389b
#!/bin/bash
# alltoall_perf sweep: 2, 4 and 8 ranks on the local node, then 16 ranks on
# node01,node02 and 32 ranks on node01..node04. The minimum message size (-b)
# doubles with each step up in rank count.
# Optional environment: SSH_PORT - forwarded as --ssh-port to mpirun_rccltest
# for the multi-node runs.
set -e

unset UCX_HOME
# export UCX_LOG_LEVEL=fatal

export NCCL_TOPO_DUMP_FILE=${PWD}/topo-generated.xml
export NCCL_GRAPH_DUMP_FILE=${PWD}/graph-generated.xml
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL

#export RCCL_SDMA_COPY_ENABLE=1
#export RCCL_SDMA_LINK_MODE=0

# PCIe mixed link
# export NCCL_SIMPLE_CHANNELS=32
# export RCCL_P2P_XHCL_CHANNEL_NUM=31
# export RCCL_COLL_XHCL_CHANNEL_NUM=28

export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_SOCKET_IFNAME=p14p2
export NCCL_IB_HCA="=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10"
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=PHB
export NCCL_NET_GDR_READ=1
unset NCCL_NCHANNELS_PER_PEER

export NCCL_TOPO_MAPPING_FILE=${PWD}/topo-mapping-bw1000.xml
# export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml # use either topo or topo mapping, not both

# Pass --ssh-port only when SSH_PORT is actually set. An unquoted, empty
# ${SSH_PORT} would make "--ssh-port" swallow the test binary name as its
# value; without the option mpirun_rccltest applies its own default port.
ssh_port_args=()
if [[ -n "${SSH_PORT:-}" ]]; then
  ssh_port_args=(--ssh-port "${SSH_PORT}")
fi

# Test arguments shared by every run below; only -b differs per run.
perf_tail=(-e 16G -f 2 -w 3 -g 1)

mpirun_rccltest -np 2 alltoall_perf -b 32 "${perf_tail[@]}"
mpirun_rccltest -np 4 alltoall_perf -b 64 "${perf_tail[@]}"
mpirun_rccltest -np 8 alltoall_perf -b 128 "${perf_tail[@]}"
mpirun_rccltest -np 16 -H node01,node02 "${ssh_port_args[@]}" \
  alltoall_perf -b 256 "${perf_tail[@]}"
mpirun_rccltest -np 32 -H node01,node02,node03,node04 "${ssh_port_args[@]}" \
  alltoall_perf -b 512 "${perf_tail[@]}"
projects/onebenchmark/rccl-tests/run-sendrecv.sh
0 → 100644
View file @
0286389b
#!/bin/bash
# sendrecv_perf between node01 and node02, one rank per node, repeated once
# for each GPU index 0..7 selected through HIP_VISIBLE_DEVICES.
# Optional environment: SSH_PORT - forwarded as --ssh-port to mpirun_rccltest.
set -e

unset UCX_HOME

export NCCL_TOPO_DUMP_FILE=${PWD}/topo-generated.xml
export NCCL_GRAPH_DUMP_FILE=${PWD}/graph-generated.xml
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL

export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_SOCKET_IFNAME=p14p2
export NCCL_IB_HCA="=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10"
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=PHB
export NCCL_NET_GDR_READ=1
unset NCCL_NCHANNELS_PER_PEER

export NCCL_TOPO_MAPPING_FILE=${PWD}/topo-mapping-bw1000.xml
# export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml # use either topo or topo mapping, not both

# Pass --ssh-port only when SSH_PORT is actually set. An unquoted, empty
# ${SSH_PORT} would make "--ssh-port" swallow the test binary name as its
# value; without the option mpirun_rccltest applies its own default port.
ssh_port_args=()
if [[ -n "${SSH_PORT:-}" ]]; then
  ssh_port_args=(--ssh-port "${SSH_PORT}")
fi

for g in {0..7}; do
  echo
  echo "Running with GPU ${g}"
  # Exported so that the wrapper's HIP_* forwarding passes it on to the ranks.
  export HIP_VISIBLE_DEVICES=${g}
  mpirun_rccltest -np 2 -H node01:1,node02:1 "${ssh_port_args[@]}" \
    sendrecv_perf -b 2G -e 2G -f 2 -w 3 -g 1
  echo
done
projects/onebenchmark/rccl-tests/topo-mapping-bw1000.xml
0 → 100644
View file @
0286389b
<system
version=
"2"
>
<group
name=
"gfx936_8_x86_64_HygonGenuine_mlx5_11_InfiniBand_40-200-200-200-200-40-5-200-200-200-200_1_8_1"
>
<cpu
numaid=
"3"
>
<pci>
<nic
id=
"mlx5_1"
/>
<nic
id=
"mlx5_2"
/>
<gpu
dev=
"0"
/>
<gpu
dev=
"1"
/>
</pci>
</cpu>
<cpu
numaid=
"0"
>
<pci>
<nic
id=
"mlx5_3"
/>
<nic
id=
"mlx5_4"
/>
<gpu
dev=
"2"
/>
<gpu
dev=
"3"
/>
</pci>
</cpu>
<cpu
numaid=
"7"
>
<pci>
<nic
id=
"mlx5_7"
/>
<nic
id=
"mlx5_8"
/>
<gpu
dev=
"4"
/>
<gpu
dev=
"5"
/>
</pci>
</cpu>
<cpu
numaid=
"4"
>
<pci>
<nic
id=
"mlx5_9"
/>
<nic
id=
"mlx5_10"
/>
<gpu
dev=
"6"
/>
<gpu
dev=
"7"
/>
</pci>
</cpu>
</group>
</system>
projects/onebenchmark/run-tests.yml
0 → 100644
View file @
0286389b
# Stage the rccl-tests helper files on every host, copy them into the
# benchmark container, run the selected test script once from the first
# inventory host, and print its output.
- name: Run RCCL Tests
  hosts: all
  become: yes
  any_errors_fatal: true
  vars_files:
    - vars.yml
  tasks:
    # Everything sits in one block so that the temp dir is removed in the
    # `always` section even when staging or the test run fails.
    - name: Stage files, run the test script and show its output
      block:
        - name: Create temp dir on remote host
          ansible.builtin.file:
            path: /tmp/{{ container_name }}/rccl-tests
            state: directory

        - name: Sync files to remote host
          ansible.posix.synchronize:
            src: "{{ playbook_dir }}/rccl-tests/"
            dest: /tmp/{{ container_name }}/rccl-tests/

        # `set -e` makes the task fail as soon as one of the commands fails;
        # otherwise only the status of the last command would be reported.
        - name: Copy files into container
          ansible.builtin.shell: |
            set -e
            docker cp /tmp/{{ container_name }}/rccl-tests/. {{ container_name }}:{{ work_dir }}
            docker cp /tmp/{{ container_name }}/rccl-tests/mpirun_rccltest {{ container_name }}:/usr/local/bin/mpirun_rccltest
            docker exec {{ container_name }} chmod +x /usr/local/bin/mpirun_rccltest

        # Runs once, on the first inventory host; stderr is merged into
        # stdout so both end up in the registered result.
        - name: Run tests
          ansible.builtin.shell: |
            docker exec -e SSH_PORT={{ ssh_port | string }} \
              -w {{ work_dir }} \
              {{ container_name }} \
              bash {{ work_dir }}/{{ test_script }} 2>&1
          delegate_to: "{{ groups['all'][0] }}"
          run_once: true
          register: result

        - name: Show output
          ansible.builtin.debug:
            msg: "{{ result.stdout_lines }}"
          run_once: true

      always:
        - name: Cleanup temp dir
          ansible.builtin.file:
            path: /tmp/{{ container_name }}
            state: absent
projects/onebenchmark/start-cluster.yml
0 → 100644
View file @
0286389b
# Start the benchmark container on every host and make sure the rccl-tests
# binaries are available inside it.
- name: Start docker cluster
  hosts: all
  become: yes
  any_errors_fatal: true
  vars_files:
    - vars.yml
  vars:
    # Checkout and build location of rccl-tests inside the container.
    rccl_tests_install_dir: /workspace/rccl-tests
  tasks:
    # The container uses the host's network and IPC namespaces and runs
    # privileged. It is recreated only when force_rm is truthy.
    - name: Run cluster container
      community.docker.docker_container:
        name: "{{ container_name }}"
        image: "{{ image_name }}"
        recreate: "{{ force_rm | default(false) | bool }}"
        state: started
        hostname: "{{ inventory_hostname }}"
        network_mode: host
        ipc_mode: host
        privileged: yes
        shm_size: 512G
        volumes:
          - /opt/hyhal:/opt/hyhal:ro
          - /root/.ssh:/root/.ssh
        working_dir: /workspace
        # Maps every inventory hostname to the default IPv4 address found in
        # that host's gathered facts.
        etc_hosts: "{{ dict(groups['all'] | zip(groups['all'] | map('extract', hostvars, ['ansible_facts', 'default_ipv4', 'address']))) }}"
        # Start sshd on ssh_port, then sleep forever as the container's
        # main command.
        command: >
          bash -c "mkdir -p /run/sshd && /usr/sbin/sshd -p {{ ssh_port }}; sleep infinity"

    # Probe only: failed_when/changed_when are false, so a missing binary is
    # recorded in rccl_installed.rc instead of failing the play.
    - name: Check if rccl-tests is already installed
      community.docker.docker_container_exec:
        container: "{{ container_name }}"
        command: test -f /usr/local/bin/all_reduce_perf
      register: rccl_installed
      failed_when: false
      changed_when: false

    # Runs when the probe did not find the binary or force_reinstall is truthy.
    - name: Install rccl-tests
      when: rccl_installed.rc != 0 or (force_reinstall | default(false) | bool)
      block:
        # Removes any previous checkout before cloning the master branch.
        - name: Clone rccl-tests
          community.docker.docker_container_exec:
            container: "{{ container_name }}"
            command: >
              bash -c "
                rm -rf {{ rccl_tests_install_dir }} &&
                git clone https://github.com/ROCm/rccl-tests.git -b master {{ rccl_tests_install_dir }}
              "

        # Links the hipify-perl found on PATH into /opt/dtk/bin before running
        # install.sh. NOTE(review): presumably install.sh looks for it under
        # the ROCm home - confirm.
        - name: Build rccl-tests
          community.docker.docker_container_exec:
            container: "{{ container_name }}"
            command: >
              bash -c "
                cd {{ rccl_tests_install_dir }} &&
                ln -sf $(which hipify-perl) /opt/dtk/bin/hipify-perl &&
                ./install.sh --mpi --mpi_home /opt/mpi \
                  --rocm_home /opt/dtk \
                  --rccl_home /opt/dtk/rccl \
                  --hip_compiler hipcc \
                  --gpu_targets {{ gpu_target }}
              "

        # /usr/local/bin is where the probe above and the verification below
        # look for the binaries.
        - name: Copy rccl-tests binaries to global PATH
          community.docker.docker_container_exec:
            container: "{{ container_name }}"
            command: >
              bash -c "
                cp {{ rccl_tests_install_dir }}/build/*_perf /usr/local/bin/ &&
                chmod +x /usr/local/bin/*_perf
              "

    # Smoke test: the binary must be found and must start.
    - name: Verify rccl-tests installation
      community.docker.docker_container_exec:
        container: "{{ container_name }}"
        command: all_reduce_perf --help
      changed_when: false
projects/onebenchmark/vars.yml
0 → 100644
View file @
0286389b
# Variables shared by start-cluster.yml and run-tests.yml.

# Image used to create the benchmark container.
# image_name: harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
image_name: onebenchmark-dtk26.04
# Container name; also used for the staging directory /tmp/<container_name>.
container_name: benchmark-dtk26-0316
# Port sshd listens on inside the container; passed to the test script as SSH_PORT.
ssh_port: 3433
# Value given to install.sh --gpu_targets when building rccl-tests.
gpu_target: gfx936
# Directory inside the container that receives the test files and serves as
# the working directory of the test run.
work_dir: /workspace/rccl-tests
# Script under work_dir that run-tests.yml executes with bash.
test_script: run-sendrecv.sh
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment