#!/bin/bash
#
# Launch the rccl-tests all_reduce_perf benchmark on the 8 local GPUs with a
# fixed NCCL/RCCL environment.
#
# Usage: run from the rccl-tests checkout root so that ./build/all_reduce_perf
# and the relative topology file path below resolve.
#
# Lines starting with "#export" / "#mpirun" / "#-x" are earlier experiments
# kept for reference; uncomment deliberately, one at a time.

# ---------------------------------------------------------------------------
# Active settings
# ---------------------------------------------------------------------------

# Channel counts and collective algorithm.
export NCCL_MIN_NCHANNELS=32
export NCCL_MIN_P2P_NCHANNELS=32
export NCCL_ALGO=Ring

# Device selection and bootstrap network interface.
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_SOCKET_IFNAME=p14p2
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Topology: P2P level and the topology description file to load.
export NCCL_P2P_LEVEL=SYS
export NCCL_TOPO_FILE=./topo-BW-0520.xml

# RCCL-specific knobs.
# NOTE(review): RCCL_COLL_XHCL_CHANNEL_NUM looks vendor/build specific --
# confirm its meaning against the RCCL build in use.
export RCCL_SDMA_COUNT_ENABLE=1
export RCCL_SDMA_COPY_ENABLE=0
# The original line ended with a stray "\" line continuation that glued it to
# the following comment; it has been removed so that editing the next line
# cannot silently change this export.
export RCCL_COLL_XHCL_CHANNEL_NUM=28

# ---------------------------------------------------------------------------
# Disabled alternatives (kept for reference)
# ---------------------------------------------------------------------------
#export NCCL_GRAPH_DUMP_FILE=./graph-dump.xml
#export NCCL_TOPO_DUMP_FILE=./topo-0515-dump.xml
#export NCCL_TOPO_DUMP_FILE=./topo-wkx-exp.xml
#export NCCL_MAX_NCHANNELS=24
#export NCCL_P2P_LEVEL=7
#export NCCL_MIN_P2P_NCHANNELS=24
#export NCCL_NCHANNELS_PER_PEER=24
#export NCCL_TOPO_FILE=./topo-0507-115-update.xml
#export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1
#export NCCL_TOPO_FILE=null
#export NCCL_GRAPH_FILE=./graph_debug.xml
#export NCCL_DEBUG=TRACE
#export NCCL_SHM_DISABLE=1
#export NCCL_P2P_DISABLE=1
#export NCCL_GRAPH_FILE=./graph_debug.xml
#export NCCL_IB_HCA=mlx5_7:1,mlx5_9:1,mlx5_10:1 # MLX5_10 carries no traffic
#export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1 # MLX5_10 carries no traffic

# ---------------------------------------------------------------------------
# Benchmark invocation
# ---------------------------------------------------------------------------
#./build/alltoall_perf -b 2 -e 2G -f 2 -g 1

# Single process driving 8 GPUs (-g 8) at a fixed 256M message size
# (-b and -e are equal, so only one size is measured).
./build/all_reduce_perf -b 256M -e 256M -f 2 -g 8

#./build/all_reduce_perf -b 2 -e 2G -f 2 -g 1

# Multi-process / multi-node variants launched through mpirun.
#mpirun -np 16 -H master:1,node1:8 --allow-run-as-root -x NCCL_TOPO_FILE=./topo-0507-115-real.xml --mca plm_rsh_args "-p 2222" -x NCCL_MAX_NCHANNELS=20 -x NCCL_MIN_NCHANNELS=20 -x NCCL_P2P_LEVEL=SYS -x NCCL_ALGO=Ring -x NCCL_MIN_P2P_NCHANNELS=20 -x NCCL_NCHANNELS_PER_PEER=20 -x HSA_FORCE_FINE_GRAIN_PCIE=1 -x HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./build/all_reduce_perf -b 2 -e 2G -f 2 -g 1
#mpirun -np 8 --allow-run-as-root -x NCCL_TOPO_FILE=./topo-0507-115-real.xml -x NCCL_MAX_NCHANNELS=20 -x NCCL_MIN_NCHANNELS=20 -x NCCL_P2P_LEVEL=SYS -x NCCL_ALGO=Ring -x HSA_FORCE_FINE_GRAIN_PCIE=1 -x HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./build/all_reduce_perf -b 2 -e 2G -f 2 -g 1

# Extra mpirun options that were tried with the commands above.
#-x NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1 \
#-x NCCL_TOPO_FILE=/data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml \
#-x NCCL_IB_QP_PER_CONNECTION=4 \
#-x NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_1:1,mlx5_10:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 \
# -x NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10 \
# --mca plm rsh \
# --mca plm_rsh_agent ssh \
# -x NCCL_GRAPH_FILE=./topo.xml \
# -x NCCL_ALGO=tree,ring \
# --mca plm_rsh_args "-2 -o StrictHostKeyChecking=no" \ # communicate over SSH
# -x NCCL_DEBUG=INFO # emit debug logs to help locate problems
# -x NCCL_IB_GID_INDEX=3 # if IB spans multiple subnets, select the GID index