#!/bin/bash
#
# Runs the all_reduce_perf benchmark through the mpirun_rccltest launcher at
# increasing rank counts: 2, 4 and 8 ranks without a host list, then 16 ranks
# on two nodes and 32 ranks on four nodes.
#
# Required environment:
#   SSH_PORT  SSH port passed to the launcher for the multi-node runs.
#
# Outputs:
#   topo-generated.xml and graph-generated.xml are requested in the current
#   working directory via NCCL_TOPO_DUMP_FILE / NCCL_GRAPH_DUMP_FILE.

set -euo pipefail

unset UCX_HOME
# export UCX_LOG_LEVEL=fatal

# Ask the library to dump the detected topology and the computed graph.
export NCCL_TOPO_DUMP_FILE="${PWD}/topo-generated.xml"
export NCCL_GRAPH_DUMP_FILE="${PWD}/graph-generated.xml"

# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL

#export RCCL_SDMA_COPY_ENABLE=1
#export RCCL_SDMA_LINK_MODE=0 # PCIe mixed link
# export NCCL_SIMPLE_CHANNELS=32
# export RCCL_P2P_XHCL_CHANNEL_NUM=31
# export RCCL_COLL_XHCL_CHANNEL_NUM=28

export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_SOCKET_IFNAME=p14p2
# The leading "=" is intentional: in NCCL_IB_HCA syntax it requests an exact
# match on the listed device names instead of a prefix match.
export NCCL_IB_HCA="=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10"
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=1

# export NCCL_ALGO=Ring
# export NCCL_PROTO=Simple
# NOTE(review): NCCL_SIMPLE_CHANNELS and NCCL_TOPO_MAPPING_FILE are not
# upstream NCCL variables; presumably they are read by the RCCL build in use.
export NCCL_SIMPLE_CHANNELS=32
unset NCCL_NCHANNELS_PER_PEER

export NCCL_TOPO_MAPPING_FILE="${PWD}/topo-mapping-bw1000.xml"
# Use either the topo file or the topo mapping file; only one is needed.
# export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml
# export NCCL_GRAPH_FILE=${PWD}/graph-16r-allreduce.xml

# Benchmark arguments shared by every run: message sizes from 4 bytes to 16G,
# multiplying by 2 each step, 3 warm-up iterations, 1 GPU per thread.
readonly -a PERF_ARGS=(-b 4 -e 16G -f 2 -w 3 -g 1)

#######################################
# Launches all_reduce_perf with the shared benchmark arguments.
# Globals:   PERF_ARGS (read)
# Arguments: launcher options forwarded verbatim to mpirun_rccltest
# Returns:   exit status of mpirun_rccltest
#######################################
run_all_reduce() {
  mpirun_rccltest "$@" all_reduce_perf "${PERF_ARGS[@]}"
}

# Runs without a host list.
run_all_reduce -np 2
run_all_reduce -np 4
run_all_reduce -np 8

# Multi-node runs need an SSH port. Checked here rather than at the top so the
# runs above still work when SSH_PORT is not set; an empty value would
# otherwise make --ssh-port swallow the next argument.
: "${SSH_PORT:?SSH_PORT must be set for the multi-node runs}"

run_all_reduce -np 16 -H node01,node02 --ssh-port "${SSH_PORT}"
run_all_reduce -np 32 -H node01,node02,node03,node04 --ssh-port "${SSH_PORT}"