run-allreduce.sh 1.32 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
#
# Runs the all_reduce_perf benchmark through the local ./mpirun_rccltest
# launcher at 2, 4, 8, 16 and 32 ranks. The 16- and 32-rank runs pass a host
# list (-H) and an SSH port (--ssh-port) to the launcher.
#
# Every path used here (./mpirun_rccltest, ./build/all_reduce_perf and the
# ${PWD}-based XML files) is resolved against the current working directory,
# so start the script from the directory that contains them.
set -euo pipefail

# Port handed to the launcher via --ssh-port for the multi-host runs.
readonly SSH_PORT=3333

unset UCX_HOME
# export UCX_LOG_LEVEL=fatal

# Files NCCL/RCCL is asked to write its topology and graph dumps to.
export NCCL_TOPO_DUMP_FILE="${PWD}/topo-generated.xml"
export NCCL_GRAPH_DUMP_FILE="${PWD}/graph-generated.xml"
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL

# export RCCL_SDMA_COPY_ENABLE=1
# export RCCL_SDMA_LINK_MODE=0

# PCIe hybrid-link settings (disabled)
# export NCCL_SIMPLE_CHANNELS=32
# export RCCL_P2P_XHCL_CHANNEL_NUM=31
# export RCCL_COLL_XHCL_CHANNEL_NUM=28

export HSA_FORCE_FINE_GRAIN_PCIE=1
# Network interface used for socket-based communication.
export NCCL_SOCKET_IFNAME=p14p2
# Per the NCCL docs, the leading "=" requests exact matching of the HCA names.
export NCCL_IB_HCA="=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10"
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=1
# export NCCL_ALGO=Ring
# export NCCL_PROTO=Simple
# NOTE(review): NCCL_SIMPLE_CHANNELS is not a stock NCCL variable; presumably
# specific to the RCCL build in use -- confirm.
export NCCL_SIMPLE_CHANNELS=32
unset NCCL_NCHANNELS_PER_PEER
# Topology description supplied to NCCL/RCCL; must exist in the working dir.
export NCCL_TOPO_FILE="${PWD}/topo-gdr-bw1000.xml"
# export NCCL_GRAPH_FILE="${PWD}/graph-16r-allreduce.xml"

# Benchmark command line shared by every run, kept in one place so that all
# rank counts are always measured with identical parameters.
# Flag meanings per the nccl-tests/rccl-tests usage text:
#   -b 4    minimum message size in bytes
#   -e 16G  maximum message size
#   -f 2    multiplication factor between successive sizes
#   -w 3    warm-up iterations
#   -g 1    GPUs per thread
readonly -a BENCH_CMD=(./build/all_reduce_perf -b 4 -e 16G -f 2 -w 3 -g 1)

#######################################
# Launches the shared benchmark command on the requested number of ranks.
# Globals:   BENCH_CMD (read)
# Arguments: $1   - rank count forwarded to the launcher's -np option
#            $2.. - optional extra launcher options (e.g. -H, --ssh-port)
# Returns:   exit status of ./mpirun_rccltest; under `set -e` a failure
#            aborts the script, so later runs are skipped.
#######################################
run_allreduce() {
  local -r nranks=$1
  shift
  ./mpirun_rccltest -np "${nranks}" "$@" "${BENCH_CMD[@]}"
}

# Runs without a host list.
run_allreduce 2
run_allreduce 4
run_allreduce 8

# Runs with an explicit host list.
run_allreduce 16 -H node01,node02 --ssh-port "${SSH_PORT}"
run_allreduce 32 -H node01,node02,node03,node04 --ssh-port "${SSH_PORT}"