run.sh 4.31 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/bin/bash
#
# Usage: run.sh <nodelist> [iter]
#   nodelist  readable file listing the hosts to use, separated by whitespace
#             or newlines
#   iter      stored in $iter; not referenced in the visible part of this script
#
# Loads the HPC-X MPI module, writes a hostfile named "list" with ${ppn} slots
# per host, and derives nhost and test_np (= nhost * ppn) for the runs below.

# Other module combinations, left commented out by the author:
#module purge
#module load compiler/intel/2022.3.0
#module load mpi/intelmpi/2021.6.0
#module load  mpi/hpcx/2.12.0/gcc-8.4.1
module purge
module load mpi/hpcx/2.12.0/gcc-8.3.1

nodelist=$1
iter=$2

# Without this check a missing argument makes the script read the host list
# from stdin, which looks like a hang.
if [[ -z "${nodelist}" || ! -r "${nodelist}" ]]; then
  echo "Usage: $0 <nodelist> [iter]  (nodelist must be a readable file)" >&2
  exit 2
fi

# Read the host names once and use the same array for both the host count and
# the hostfile. Counting with "wc -l" disagrees with the hostfile contents when
# the list has blank lines or no trailing newline. Reading with "read" also
# avoids glob expansion of the entries.
hosts=()
while read -r -a words || (( ${#words[@]} > 0 )); do
  hosts+=("${words[@]}")
done < "${nodelist}"

nhost=${#hosts[@]}
if (( nhost == 0 )); then
  echo "no hosts found in ${nodelist}" >&2
  exit 1
fi

ppn=8
test_np=$((nhost * ppn))

# Regenerate the hostfile from scratch: one "<host> slots=<ppn>" line per host.
hostfile=list
rm -fr -- "${hostfile}"
for host in "${hosts[@]}"; do
  echo "${host} slots=${ppn}"
done > "${hostfile}"

export UCX_WARN_UNUSED_ENV_VARS=n
wh1225's avatar
wh1225 committed
25
# Benchmark names handed to ./single_process.sh, in execution order.
perf_program=(
  all_gather_perf
  all_reduce_perf
  broadcast_perf
  reduce_scatter_perf
  reduce_perf
)
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Tuning sweep.
#
# For every process count from test_np down to one step's worth of ranks
# (stepping by ppn), run each benchmark in perf_program twice over:
#   1. with NCCL_ALGO / NCCL_PROTO unset, so the library chooses by itself;
#   2. with every forced NCCL_ALGO x NCCL_PROTO combination.
# After each process count the two Python helpers are run; the second one is
# given qz.txt and the path to tuning.cc.
#
# Reads:  test_np, ppn, nhost, hostfile, perf_program (set earlier in the file)
# Writes: qz.txt (truncated once, then appended to), shuju.txt (truncated per
#         process count; presumably filled by the benchmark wrapper - confirm).

#######################################
# Launch one benchmark under mpirun.
# NCCL_ALGO / NCCL_PROTO are forwarded to the ranks only when they are set, so
# the same helper serves both the automatic and the forced runs.
# Globals:   hostfile, nhost, NCCL_ALGO, NCCL_PROTO (read)
# Arguments: $1 - number of MPI processes
#            $2 - benchmark name passed to ./single_process.sh
# Returns:   mpirun's exit status
#######################################
run_perf_test() {
  local np=$1
  local prog=$2
  local -a nccl_env=()

  if [[ -n "${NCCL_ALGO+set}" ]]; then
    nccl_env+=(-x NCCL_ALGO)
  fi
  if [[ -n "${NCCL_PROTO+set}" ]]; then
    nccl_env+=(-x NCCL_PROTO)
  fi

  mpirun --allow-run-as-root -np "${np}" -hostfile "${hostfile}" \
    --mca plm_rsh_no_tree_spawn 1 --mca plm_rsh_num_concurrent "${nhost}" \
    -mca routed_radix "${nhost}" "${nccl_env[@]}" \
    -x UCX_IB_ADDR_TYPE=ib_global -x UCX_RNDV_THRESH=16384 \
    -x UCX_ZCOPY_THRESH=16384 -x UCX_MAX_EAGER_LANES=4 -x UCX_MAX_RNDV_LANES=4 \
    -x LD_LIBRARY_PATH -mca btl_openib_warn_default_gid_prefix 0 \
    -mca btl_openib_warn_no_device_params_found 0 \
    -mca coll_hcoll_enable 0 -mca coll_hcoll_np "${np}" \
    --bind-to none ./single_process.sh "${prog}"
}

#######################################
# Choose the algorithms to force for a benchmark: all_reduce_perf is swept
# over TREE and RING, every other benchmark over RING only.
# Globals:   algos (written)
# Arguments: $1 - benchmark name
#######################################
select_algos() {
  if [[ "$1" == "all_reduce_perf" ]]; then
    algos=("TREE" "RING")
  else
    algos=("RING")
  fi
}

: > qz.txt

# Keep the starting process count: test_np is overwritten inside the loop, so
# printing it as the "original" value would be wrong from the third pass on.
original_np=${test_np:-0}

# Step by the per-node process count rather than a hard-coded 8.
np_step=${ppn:-8}
if (( np_step <= 0 )); then
  echo "ppn must be a positive integer (got '${np_step}')" >&2
  exit 1
fi

protos=("LL" "SIMPLE")

for ((current_np = original_np; current_np > 0; current_np -= np_step)); do
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "当前 current_np = ${current_np} (原始 test_np = ${original_np})"
  echo "======================================================================="

  # Make sure nothing is forced, so NCCL picks algorithm and protocol itself.
  unset NCCL_ALGO
  unset NCCL_PROTO
  : > shuju.txt
  echo "test_np=${current_np}" >> qz.txt  # append: one entry per process count
  test_np=${current_np}                   # kept in sync, as the original did

  # Pass 1: automatic selection.
  for prog in "${perf_program[@]}"; do
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo "测试项目: ${prog}"
    run_perf_test "${current_np}" "${prog}" \
      || echo "warning: ${prog} (auto) exited with status $?" >&2
    sleep 2
  done

  echo "### 完成测试: NCCL 自动选择策略"
  echo ""
  sleep 5

  # Pass 2: every forced NCCL_ALGO / NCCL_PROTO combination.
  for prog in "${perf_program[@]}"; do
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo "测试项目: ${prog}"

    select_algos "${prog}"

    for algo in "${algos[@]}"; do
      for proto in "${protos[@]}"; do
        echo "======================================================================"
        echo "### 开始测试: NCCL_ALGO=${algo}, NCCL_PROTO=${proto}"
        echo "======================================================================="
        export NCCL_ALGO=${algo}
        export NCCL_PROTO=${proto}

        run_perf_test "${current_np}" "${prog}" \
          || echo "warning: ${prog} (${algo}/${proto}) exited with status $?" >&2

        echo "### 完成测试: NCCL_ALGO=${algo}, NCCL_PROTO=${proto}"
        echo ""
        sleep 5
      done
    done
  done

  echo "所有5次测试均已完成。"
  # Per the original notes: panduan.py writes the required changes to qz.txt,
  # and the second script applies qz.txt to tuning.cc.
  python panduan.py \
    || echo "warning: panduan.py exited with status $?" >&2
  python parse_qz_and_modify_tuning.py qz.txt ../rccl-dtk-25.04/src/graph/tuning.cc \
    || echo "warning: parse_qz_and_modify_tuning.py exited with status $?" >&2
done


# Rebuild and re-run
101
# Enter the RCCL source tree. Abort if that fails: the script later runs
# "rm -rf build" relative to the current directory, which must not happen
# anywhere else.
cd ../rccl-dtk-25.04 || {
  echo "cannot cd to ../rccl-dtk-25.04" >&2
  exit 1
}
102
103
# Print the working directory followed by a notice that the rebuild starts here.
pwd
printf '%s\n' "当前在 rccl 目录内"
104
105
106
107
108
# Load the build environment: start from a clean module set, then load the
# MPI and DTK compiler modules in this order.
module purge
for build_module in mpi/hpcx/2.12.0/gcc-8.3.1 compiler/dtk/25.04; do
  module load "${build_module}"
done

109
110
111
112
113
114
115
116
117
# Rebuild from a clean out-of-tree build directory, using hipcc as the C++
# compiler. Each step is checked so that a failure stops the script here
# instead of running later steps in the wrong directory or reporting success
# after a broken build.
rm -rf build
mkdir build || { echo "cannot create build directory" >&2; exit 1; }
cd build || { echo "cannot cd to build directory" >&2; exit 1; }
CXX=hipcc cmake .. || { echo "cmake configuration failed" >&2; exit 1; }
# A bare "make -j" places no limit on parallel jobs; cap it at the number of
# available CPUs (falling back to 8 if nproc is unavailable).
make -j "$(nproc 2>/dev/null || echo 8)" || { echo "build failed" >&2; exit 1; }




118

wh1225's avatar
wh1225 committed
119
# Final status line.
printf '%s\n' "完成优化"