#!/bin/bash
#
# Per-rank launcher for the RCCL test binaries.
#
# Usage: <this script> <test_binary_name>
#   $1 - name of a binary inside ${DIR} (for example all_reduce_perf); it is
#        run with the arguments "-b 8 -e 1G -f 2 -g 1".
#
# The script raises the open-file limit, exports the NCCL/RCCL tuning
# variables below, reads the Open MPI rank variables
# (OMPI_COMM_WORLD_LOCAL_RANK / OMPI_COMM_WORLD_RANK, both defaulting to 0)
# and starts the test under numactl with a rank-dependent NUMA binding.
# The exit status is that of the launch function called on the last line.

ulimit -n 200000

# Set the RCCL library path using a relative path (relative to the scripts
# directory).
export LD_LIBRARY_PATH="../rccl-dtk-25.04/build:${LD_LIBRARY_PATH}"

# Set the test-program directory using a relative path.
DIR="../rccl-test/build"

#APP="${DIR}/${1} -b 8 -e 2G -f 2 -g 1"
# echo $APP
#APP="${DIR}/sendrecv_perf -b 8 -e 1G -f 2 -g 1"
#APP="${DIR}/sendrecv_perf --minbytes=262144000 --maxbytes=262144000 --parallel_init=0 --warmup_iters=1 --stepfactor=2 --iters=10"
# APP="${DIR}/alltoall_perf -b 8 -e 1G -f 2 -g 1"
#APP="${DIR}/all_reduce_perf -b 8 -e 256K -f 2 -g 1"

# Adjust the maximum test data size according to NCCL_PROTO.
# The LL protocol performs very poorly with large data sizes; limit it to 512K.
# if [ "$NCCL_PROTO" = "LL" ]; then
#     MAX_SIZE="512K"
# else
#     MAX_SIZE="1G"
# fi
# APP="${DIR}/${1} -b 8 -e ${MAX_SIZE} -f 2 -g 1"
# APP="${DIR}/${1} -b 8 -e 1G -f 2 -g 1"

# Command line of the test. It is kept as a single string and expanded
# unquoted at launch time, so it is word-split into program + arguments.
APP="${DIR}/${1} -b 8 -e 1G -f 2 -g 1"

# APP="${DIR}/broadcast_perf -b 131072 -e 4194304 -i 131072 -g 1"
# APP="${DIR}/all_reduce_perf -b 256M -e 278M -i 1048576 -g 1"
# APP="${DIR}/all_gather_perf -b 8 -e 2M -f 2 -g 1"
#APP="${DIR}/broadcast_perf -b 8 -e 16G -f 2 -g 1"
# APP="${DIR}/reduce_perf -b 8 -e 4G -f 2 -g 1"
#APP="${DIR}/reduce_scatter_perf -b 8 -e 4G -f 2 -g 1"
# APP="${DIR}/sendrecv_perf -b 8 -e 4G -f 2 -g 1"
# APP="${DIR}/alltoall_perf -b 8 -e 2G -f 2 -g 1"
# APP="${DIR}/gather_perf -b 8 -e 2M -f 2 -g 1"
#APP="${DIR}/scatter_perf -b 8 -e 4G -f 2 -g 1"
#APP="/public/home/mssungf/rccl_learn/rccl-dtk-24.04.1.1/test2"
#APP="${DIR}/gather_perf -b 8 -e 128M -f 2 -g 1"
#APP="${DIR}/all_gather_perf -b 8 -e 1024M -f 2 -g 1"

# Only referenced by the disabled per-host graph selection below.
host=$(hostname)

#++++++++++++++ nccl env +++++++++++++++++++++++
#export NCCL_GRAPH_DUMP_FILE=./graph.xml
#if [ "$host"x == "a01r2n36"x ];
#then
#    export NCCL_GRAPH_FILE=./graph1.xml
#    echo "$host $NCCL_GRAPH_FILE" > /tmp/env.log
#elif [ "$host"x == "a01r1n03"x ];
#then
#    export NCCL_GRAPH_FILE=./graph2.xml
#    echo "$host $NCCL_GRAPH_FILE" > /tmp/env.log
#fi
#export NCCL_TOPO_DUMP_FILE=./topo.xml
export NCCL_TOPO_FILE=./topo-input.xml
#export NCCL_TOPO_DUMP_FILE=./topo_out.xml
export NCCL_GRAPH_DUMP_FILE=./graph.xml
# NOTE(review): the next export followed a dangling "#" in the original
# layout; it is kept active here - confirm that reading and dumping the same
# ./graph.xml is intended.
export NCCL_GRAPH_FILE=./graph.xml
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_NCHANNELS=32
export NCCL_NET_GDR_READ=1
export NCCL_NET_GDR_LEVEL=SYS # alternatives noted by the author: LOC PIX PHB PXB SYS
#export NCCL_P2P_LEVEL=PXB
#export NCCL_SHM_DISABLE=1
#export NCCL_P2P_DISABLE=1
#export HIP_VISIBLE_DEVICES=0,1,4,5 #,4,5,6,7 #0,1,2,3
#export CUDA_VISIBLE_DEVICES=0,4 #,1,2,3 #,4,5,6,7 #0,1,2,3
#export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=mlx5_1:1 #,mlx5_1:1,mlx5_3:1 #,mlx5_3:1
#export NCCL_IB_HCA=mlx5
#export NCCL_IB_SL=1
#export NCCL_SOCKET_IFNAME=ibs8 #ens24f0np0 #,ens24f0np1,ens52f0np0,ens52f0np1
#export NCCL_IB_DISABLE=1
#export NCCL_CROSS_NIC=1
# export NCCL_ALGO=TREE
# export NCCL_ALGO=RING
# export NCCL_PROTO=LL
# export NCCL_PROTO=LL128
# export NCCL_PROTO=SIMPLE
#export NCCL_P2P_RATIO=0.5
#export NCCL_DEBUG=INFO
#export NCCL_DEBUG_SUBSYS=ALL
##export NCCL_DEBUG_FILE=./log.%h.%p
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_MIN_P2P_NCHANNELS=16 # dtk 24.04.1 sendrecv improvement
export NCCL_NCHANNELS_PER_PEER=16 # dtk 24.04.1 sendrecv improvement
#++++++++++++++++++++ nccl env +++++++++++++++++

## ucx
#export UCX_TLS=self,sm,rc
##

# Rank of this process on its node / in the whole job, as provided by
# Open MPI; both fall back to 0 when the variable is not set.
local_rank=${OMPI_COMM_WORLD_LOCAL_RANK:-0}
common_rank=${OMPI_COMM_WORLD_RANK:-0}

# Use bash built-in arithmetic; bc is not needed.
# Integer division: four consecutive local ranks share one value of mod.
mod=$((local_rank / 4))

#######################################
# Run ${APP} with CPU and memory bound to a single NUMA node.
# Globals:   APP (read)
# Arguments: $1 - NUMA node number passed to numactl
# Returns:   exit status of numactl
#######################################
run_on_numa_node() {
  local node=$1
  # APP is intentionally unquoted: it holds "program arg arg ..." as one
  # string and must be word-split here.
  # shellcheck disable=SC2086
  numactl --cpunodebind="${node}" --membind="${node}" $APP
}

#######################################
# Report a rank for which the calling layout defines no binding.
# Arguments: $1 - human-readable description of the offending value
# Outputs:   diagnostic on stderr
# Returns:   1, so the caller (and the script) fails instead of silently
#            launching nothing for this rank
#######################################
rank_unmapped() {
  printf '%s: no NUMA binding defined for %s\n' "${0##*/}" "$1" >&2
  return 1
}

#######################################
# Layout 1: bind to NUMA node (local_rank / 4), for nodes 0..3.
# Globals:   mod (read), local_rank (read, diagnostics only), APP (read)
# Returns:   exit status of the launched command, or 1 when mod is not 0..3
#
# Disabled per-node overrides kept for reference:
#   node 0: NCCL_IB_HCA="mlx5_0:1"  HIP_VISIBLE_DEVICES=0
#           UCX_NET_DEVICES="mlx5_4:1"
#   node 1: NCCL_IB_HCA="mlx5_4:1,mlx5_5:1"  UCX_IB_PCI_BW="mlx5_1:50Gbs"
#           HIP_VISIBLE_DEVICES=1
#   node 2: UCX_NET_DEVICES="mlx5_4:1"  UCX_IB_PCI_BW="mlx5_2:50Gbs"
#           HIP_VISIBLE_DEVICES=3
#   node 3: UCX_NET_DEVICES="mlx5_5:1"  UCX_IB_PCI_BW="mlx5_3:50Gbs"
#           HIP_VISIBLE_DEVICES=3
#######################################
test1() {
  case "${mod}" in
    0 | 1 | 2 | 3)
      run_on_numa_node "${mod}"
      ;;
    *)
      rank_unmapped "local rank ${local_rank} (node index ${mod})"
      ;;
  esac
}

#######################################
# Layout 2: local ranks 0-3 run on NUMA node 3, local ranks 4-7 on NUMA
# node 7. Ranks 0 and 1 additionally select an InfiniBand HCA.
# Globals:   local_rank (read), APP (read), NCCL_IB_HCA (exported for
#            ranks 0 and 1)
# Returns:   exit status of the launched command, or 1 for any other rank
#
# Disabled overrides kept for reference:
#   ranks 0-3: HIP_VISIBLE_DEVICES=0,1,2,3
#   ranks 2-3: NCCL_IB_HCA=mlx5_0:1,mlx5_1:1
#   ranks 4-7: NCCL_IB_HCA=mlx5_4:1,mlx5_5:1  HIP_VISIBLE_DEVICES=4,5,6,7
#######################################
test2() {
  case "${local_rank}" in
    0)
      export NCCL_IB_HCA=mlx5_0:1
      run_on_numa_node 3
      ;;
    1)
      export NCCL_IB_HCA=mlx5_4:1
      run_on_numa_node 3
      ;;
    2 | 3)
      run_on_numa_node 3
      ;;
    4 | 5 | 6 | 7)
      run_on_numa_node 7
      ;;
    *)
      rank_unmapped "local rank ${local_rank}"
      ;;
  esac
}

#######################################
# Layout 3: keyed on the global rank (0-3). Each rank is restricted to one
# GPU through HIP_VISIBLE_DEVICES; ranks 2 and 3 also pin the UCX network
# device.
# Globals:   common_rank (read), APP (read), HIP_VISIBLE_DEVICES and
#            UCX_NET_DEVICES (exported)
# Returns:   exit status of the launched command, or 1 for any other rank
#
# Disabled overrides kept for reference:
#   rank 0: UCX_NET_DEVICES="mlx5_0:1"
#   rank 1: UCX_NET_DEVICES="mlx5_4:1"
#   rank 2: UCX_IB_PCI_BW="mlx5_2:50Gbs"
#   rank 3: UCX_IB_PCI_BW="mlx5_3:50Gbs"
#######################################
test3() {
  case "${common_rank}" in
    0)
      export HIP_VISIBLE_DEVICES=0
      run_on_numa_node 3
      ;;
    1)
      export HIP_VISIBLE_DEVICES=1
      run_on_numa_node 3
      ;;
    2)
      export UCX_NET_DEVICES="mlx5_4:1"
      export HIP_VISIBLE_DEVICES=0
      run_on_numa_node 7
      ;;
    3)
      export UCX_NET_DEVICES="mlx5_5:1"
      export HIP_VISIBLE_DEVICES=1
      run_on_numa_node 7
      ;;
    *)
      rank_unmapped "global rank ${common_rank}"
      ;;
  esac
}

# Active layout; swap for test2 or test3 to use a different binding scheme.
test1