#!/bin/bash
#
# Runs the sendrecv_perf benchmark between node01 and node02 (one MPI rank
# per node) once for each GPU index 0..7, exposing a single GPU per run
# through HIP_VISIBLE_DEVICES.
#
# Required environment:
#   SSH_PORT  - value passed to mpirun_rccltest via --ssh-port.
#
# Files referenced relative to the current working directory:
#   topo-mapping-bw1000.xml  - read via NCCL_TOPO_MAPPING_FILE
#   topo-generated.xml       - written via NCCL_TOPO_DUMP_FILE
#   graph-generated.xml      - written via NCCL_GRAPH_DUMP_FILE

# Abort on the first failing command (a failed run stops the GPU sweep).
set -e

# Fail fast with a clear message: if SSH_PORT were unset or empty, the
# expansion would vanish and --ssh-port would consume the next argument.
: "${SSH_PORT:?SSH_PORT must be set (value for mpirun_rccltest --ssh-port)}"

unset UCX_HOME

# Dump the detected topology and the computed graph next to the script's
# working directory for later inspection.
export NCCL_TOPO_DUMP_FILE="${PWD}/topo-generated.xml"
export NCCL_GRAPH_DUMP_FILE="${PWD}/graph-generated.xml"

# Uncomment for verbose library logging.
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL

export HSA_FORCE_FINE_GRAIN_PCIE=1

# Network selection: socket interface and the list of InfiniBand HCAs.
# NOTE(review): the leading "=" in NCCL_IB_HCA is part of the value and is
# presumably the exact-match prefix described in the NCCL docs - confirm.
export NCCL_SOCKET_IFNAME=p14p2
export NCCL_IB_HCA="=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10"

# Peer-to-peer and GPUDirect RDMA tuning.
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=PHB
export NCCL_NET_GDR_READ=1
unset NCCL_NCHANNELS_PER_PEER

# Only one of the topology file and the topology mapping file is needed;
# the mapping file is the one currently in use.
export NCCL_TOPO_MAPPING_FILE="${PWD}/topo-mapping-bw1000.xml"
# export NCCL_TOPO_FILE=${PWD}/topo-gdr-bw1000.xml

# Sweep GPUs 0..7, one benchmark run per GPU.
for g in {0..7}; do
  echo
  echo "Running with GPU ${g}"

  # Restrict the run to the GPU under test.
  export HIP_VISIBLE_DEVICES="${g}"

  # Two ranks, one on each node.
  # NOTE(review): the sendrecv_perf flags presumably follow rccl-tests
  # conventions (fixed 2G message size, 3 warm-up iterations, 1 GPU per
  # rank) - confirm against the rccl-tests README.
  mpirun_rccltest -np 2 -H node01:1,node02:1 --ssh-port "${SSH_PORT}" \
    sendrecv_perf -b 2G -e 2G -f 2 -w 3 -g 1

  echo
done