#!/bin/bash
#
# Sweeps the NCCL/RCCL collective perf programs over decreasing process counts,
# feeds the results to the tuning helpers, and finally rebuilds RCCL.
#
# Usage: <this script> NODELIST [ITER]
#   NODELIST  file containing the host names (whitespace/newline separated)
#   ITER      accepted for backward compatibility; this script does not use it
#
# For every process count from nhost*ppn down to ppn (in steps of ppn):
#   1. each perf program is launched through mpirun with NCCL_ALGO and
#      NCCL_PROTO unset, so the library picks algorithm/protocol itself;
#   2. each perf program is launched again for every forced
#      NCCL_ALGO/NCCL_PROTO combination;
#   3. panduan.py and parse_qz_and_modify_tuning.py are run on the results.
# Afterwards ../rccl-dtk-25.04 is rebuilt from a fresh build directory.
#
# Files written in the current directory: list (mpirun hostfile), qz.txt,
# shuju.txt (truncated per iteration; presumably filled by
# single_process.sh / panduan.py -- verify against those scripts).
#
# Errors are reported with `return` rather than `exit`; the script's exit
# status is the status of the final `main "$@"` call.
#
# Toolchains tried earlier, kept for reference:
#module purge
#module load compiler/intel/2022.3.0
#module load mpi/intelmpi/2021.6.0
#module load mpi/hpcx/2.12.0/gcc-8.4.1

# Ranks started per host; also the step by which the process count shrinks.
ppn=8
# Hostfile generated for mpirun.
hostfile=list
# Perf programs handed to ./single_process.sh, in execution order.
perf_program=(all_gather_perf all_reduce_perf broadcast_perf reduce_scatter_perf reduce_perf)
# Protocols forced through NCCL_PROTO in the second pass.
protos=("LL" "SIMPLE")
# Filled by load_hosts / main.
hosts=()
nhost=0

# Separator lines printed verbatim in the log output.
readonly rule_heavy="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
readonly rule_long="======================================================================="
readonly rule_short="======================================================================"

#######################################
# Reads every whitespace-separated token of a node list into `hosts`.
# Blank lines are skipped and a missing trailing newline is tolerated, so
# the host count always matches the entries later written to the hostfile.
# Globals:   hosts (written)
# Arguments: $1 - path of the node list file
# Returns:   0 if at least one host was read, 1 otherwise
#######################################
load_hosts() {
  local -a fields=()
  hosts=()
  [[ -f "$1" && -r "$1" ]] || return 1
  # `read` fails on a last line without newline but still fills `fields`.
  while read -r -a fields || (( ${#fields[@]} > 0 )); do
    if (( ${#fields[@]} > 0 )); then
      hosts+=("${fields[@]}")
    fi
  done < "$1"
  (( ${#hosts[@]} > 0 ))
}

#######################################
# Writes one "<host> slots=<n>" line per entry of `hosts`, replacing any
# previous file of that name.
# Globals:   hosts (read)
# Arguments: $1 - output path, $2 - slots per host
# Returns:   non-zero if the file cannot be written
#######################################
write_hostfile() {
  local out=$1 slots=$2 host
  rm -rf -- "${out:?}"
  for host in "${hosts[@]}"; do
    printf '%s slots=%s\n' "$host" "$slots"
  done > "$out"
}

#######################################
# Launches one perf program through mpirun with the shared option set.
# Globals:   hostfile, nhost (read)
# Arguments: $1 - number of processes (-np)
#            $2 - perf program name passed to ./single_process.sh
#            $3.. - extra mpirun options (e.g. -x NCCL_ALGO)
# Returns:   mpirun's exit status
#######################################
run_perf() {
  local np=$1 program=$2
  shift 2
  mpirun --allow-run-as-root -np "$np" -hostfile "$hostfile" \
    --mca plm_rsh_no_tree_spawn 1 --mca plm_rsh_num_concurrent "$nhost" \
    -mca routed_radix "$nhost" "$@" \
    -x UCX_IB_ADDR_TYPE=ib_global -x UCX_RNDV_THRESH=16384 \
    -x UCX_ZCOPY_THRESH=16384 -x UCX_MAX_EAGER_LANES=4 -x UCX_MAX_RNDV_LANES=4 \
    -x LD_LIBRARY_PATH -mca btl_openib_warn_default_gid_prefix 0 \
    -mca btl_openib_warn_no_device_params_found 0 \
    -mca coll_hcoll_enable 0 -mca coll_hcoll_np "$np" \
    --bind-to none ./single_process.sh "$program"
}

#######################################
# Pass 1: runs every perf program with NCCL_ALGO/NCCL_PROTO left unset.
# A failing mpirun does not stop the sweep (same as before).
# Arguments: $1 - number of processes
#######################################
run_auto_pass() {
  local np=$1 program
  for program in "${perf_program[@]}"; do
    echo "$rule_heavy"
    echo "测试项目: ${program}"
    run_perf "$np" "$program"
    sleep 2
  done
  echo "### 完成测试: NCCL 自动选择策略"
  echo ""
  sleep 5
}

#######################################
# Pass 2: runs every perf program for all NCCL_ALGO x NCCL_PROTO combinations.
# Globals:   NCCL_ALGO, NCCL_PROTO (exported; left set on return)
# Arguments: $1 - number of processes
#######################################
run_forced_pass() {
  local np=$1 program algo proto
  local -a algos
  for program in "${perf_program[@]}"; do
    echo "$rule_heavy"
    echo "测试项目: ${program}"
    # Algorithm choice depends on the program: only all_reduce_perf is
    # additionally run with TREE.
    if [[ "$program" == "all_reduce_perf" ]]; then
      algos=("TREE" "RING")
    else
      algos=("RING")
    fi
    for algo in "${algos[@]}"; do
      for proto in "${protos[@]}"; do
        echo "$rule_short"
        echo "### 开始测试: NCCL_ALGO=${algo}, NCCL_PROTO=${proto}"
        echo "$rule_long"
        export NCCL_ALGO=${algo}
        export NCCL_PROTO=${proto}
        run_perf "$np" "$program" -x NCCL_ALGO -x NCCL_PROTO
        echo "### 完成测试: NCCL_ALGO=${algo}, NCCL_PROTO=${proto}"
        echo ""
        sleep 5
      done
    done
  done
}

#######################################
# Rebuilds RCCL in ../rccl-dtk-25.04 from a fresh build directory.
# Returns: non-zero as soon as a step fails, so `rm -rf build` can never run
#          in the wrong directory and success is only reported after a
#          successful build.
#######################################
rebuild_rccl() {
  cd ../rccl-dtk-25.04 || return 1
  pwd
  echo "当前在 rccl 目录内"
  # Load the build environment.
  module purge
  module load mpi/hpcx/2.12.0/gcc-8.3.1
  module load compiler/dtk/25.04
  rm -rf build
  mkdir build || return 1
  cd build || return 1
  CXX=hipcc cmake .. || return 1
  make -j || return 1
  echo "完成优化"
}

#######################################
# Entry point: validates the arguments, prepares the hostfile, runs the
# sweep and rebuilds RCCL.
# Arguments: $1 - node list file, $2 - unused (kept for compatibility)
# Returns:   2 on usage error, 1 on setup/build failure, 0 otherwise
#######################################
main() {
  if (( $# < 1 )); then
    printf 'Usage: %s NODELIST [ITER]\n' "${0##*/}" >&2
    return 2
  fi
  local nodelist=$1
  if ! load_hosts "$nodelist"; then
    printf '%s: cannot read any host from %s\n' "${0##*/}" "$nodelist" >&2
    return 1
  fi
  nhost=${#hosts[@]}
  # Kept separate from the loop variable so the banner really shows the
  # original total on every iteration.
  local total_np=$(( nhost * ppn ))
  local current_np

  module purge
  module load mpi/hpcx/2.12.0/gcc-8.3.1

  write_hostfile "$hostfile" "$ppn" || return 1
  export UCX_WARN_UNUSED_ENV_VARS=n

  # qz.txt is emptied once and then appended to on every iteration.
  : > qz.txt
  for (( current_np = total_np; current_np > 0; current_np -= ppn )); do
    echo "$rule_heavy"
    echo "当前 current_np = ${current_np} (原始 test_np = ${total_np})"
    echo "$rule_long"
    # Make sure the variables are unset so NCCL chooses automatically.
    unset NCCL_ALGO NCCL_PROTO
    : > shuju.txt
    echo "test_np=${current_np}" >> qz.txt

    run_auto_pass "$current_np"
    # Iterate over the NCCL_ALGO / NCCL_PROTO combinations.
    run_forced_pass "$current_np"
    echo "所有5次测试均已完成。"

    # Per the original note: outputs the content that needs changing to qz.txt.
    python panduan.py
    # Modifies tuning.cc according to the content of qz.txt.
    python parse_qz_and_modify_tuning.py qz.txt ../rccl-dtk-25.04/src/graph/tuning.cc
  done

  # Rebuild so the modified tuning takes effect.
  rebuild_rccl
}

main "$@"