test_start.sh 3.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/bin/bash

# Parse the optional --key=value flags found anywhere on the command line.
#
# Recognised flags (matched by prefix; the value is everything after the
# first '=' in the argument):
#   --launch_with_binding=...  -> launch_with_binding
#   --testmode=...             -> testmode
#   --profiling=...            -> profiling
# Arguments matching none of these prefixes are ignored.
#
# Globals written: launch_with_binding, testmode, profiling (left untouched
# when the corresponding flag is absent).
# Arguments: the script's own argument list.
parse_launch_flags() {
    local para
    # "$@" hands over every argument as exactly one word. An unquoted $*
    # would re-split values that contain whitespace and glob-expand values
    # that contain characters such as '*'.
    for para in "$@"; do
        # Arms are tested top to bottom, first match wins.
        case ${para} in
            --launch_with_binding*) launch_with_binding=${para#*=} ;;
            --testmode*)            testmode=${para#*=} ;;
            --profiling*)           profiling=${para#*=} ;;
        esac
    done
}

parse_launch_flags "$@"

# Default environment.
# Rendezvous endpoint: first positional argument is the host part, second
# the port (they are later combined as tcp://<url>:<port>).
DIST_URL=$1
DIST_PORT=$2

# Per-process identity, copied from the OMPI_COMM_WORLD_* environment
# variables.
WORLD_SIZE=${OMPI_COMM_WORLD_SIZE}
RANK=${OMPI_COMM_WORLD_RANK}
LOCAL_RANK=${OMPI_COMM_WORLD_LOCAL_RANK}

# =============================================================================
# Debug output (confirms the environment variables were passed on correctly)
# =============================================================================
# Print a one-off summary of the launch configuration, from rank 0 only.
#
# Globals read: RANK, testmode, WORLD_SIZE, DIST_URL, DIST_PORT, PYTHONPATH,
#               TEST_DIR, ROCSHMEM_TOPO_FILE_FORCE, NCCL_NET_PLUGIN,
#               NCCL_IB_HCA, HSA_FORCE_FINE_GRAIN_PCIE
# Outputs:      the summary on stdout when RANK is 0, otherwise nothing.
# Returns:      0 when nothing is printed; otherwise the status of the last echo.
print_launch_summary() {
    # Compare as a string: `[ "$RANK" -eq 0 ]` writes "integer expression
    # expected" to stderr whenever RANK is empty or unset.
    [[ ${RANK:-} == 0 ]] || return 0

    echo "=== DeepEP Test Start ==="
    echo "Test mode: ${testmode:-internode}"
    echo "World size: $WORLD_SIZE"
    echo "Master: $DIST_URL:$DIST_PORT"
    echo "PYTHONPATH: $PYTHONPATH"
    echo "TEST_DIR: $TEST_DIR"
    echo "ROCSHMEM_TOPO_FILE_FORCE: $ROCSHMEM_TOPO_FILE_FORCE"
    echo "NCCL_PLUGIN: ${NCCL_NET_PLUGIN:-none}"
    echo "NCCL_IB_HCA: ${NCCL_IB_HCA:-auto}"
    echo "HSA_FORCE_FINE_GRAIN_PCIE: ${HSA_FORCE_FINE_GRAIN_PCIE:-not set}"
}

print_launch_summary

# Rank, world-size, local-rank and rendezvous-URL options for the test script.
# The expansions are unquoted, so an empty variable contributes no element.
DISTRIBUTED_ARGS=()
DISTRIBUTED_ARGS+=(--rank ${RANK})
DISTRIBUTED_ARGS+=(--world-size ${WORLD_SIZE})
DISTRIBUTED_ARGS+=(--local-rank ${LOCAL_RANK})
DISTRIBUTED_ARGS+=(--dist-url tcp://${DIST_URL}:${DIST_PORT})

# Options common to every test mode.
TEST_BASE_ARGS=(--hidden 7168 --num-experts 256 --num-topk 8)

# APP definition for the three test modes. Each mode selects a test script
# and its option list (the shared base options plus a mode-specific token
# count); the command string is then assembled once below.
if [[ ${testmode} == intranode ]]; then
    # Intra-node test
    test_script=test_intranode.py
    mode_args=("${TEST_BASE_ARGS[@]}" --num-tokens 4096)
elif [[ ${testmode} == lowlatency ]]; then
    # Low-latency test (optional flag, currently disabled: --pressure-test)
    test_script=test_low_latency.py
    mode_args=("${TEST_BASE_ARGS[@]}" --num-tokens 128)
else
    # Inter-node test: the default, also taken for any unrecognised mode
    # (optional flag, currently disabled: --test-ll-compatibility)
    test_script=test_internode.py
    mode_args=("${TEST_BASE_ARGS[@]}" --num-tokens 4096)
fi

# APP is a flat string: inside the quotes each array is joined with spaces.
# The leading blanks of the continuation lines are inside the quotes too and
# therefore part of the value, so their width (12 columns) matters to anything
# that prints APP verbatim.
APP="python3 -u ${TEST_DIR}/${test_script} \
            ${DISTRIBUTED_ARGS[@]} \
            ${mode_args[@]} \
            "

###############################################################################

# Extra test-script options appended to APP when profiling is "torch".
TORCH_PROFILE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 6 8 32
    --profile-step-start 3 --profile-step-end 4
    --profile-dir torch_prof_aibenchmark_8nodes_tp4-pp2-ep8-etp2-cp1-vp2
    --use-pytorch-profiler
)

# Extra test-script options appended to APP when profiling is "hip".
HIP_PROFILE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 6 8 32
    --profile-step-start 4 --profile-step-end 5
    --use-hip-profiler
)

# Any other (or empty) profiling value leaves APP untouched.
case ${profiling} in
    torch)
        APP="${APP} ${TORCH_PROFILE_ARGS[*]}"
        ;;
    hip)
        # Output directory handed to hipprof via -d.
        mkdir -p hip_prof_data
        # Profiling options go after the test command; hipprof wraps it all.
        APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP} ${HIP_PROFILE_ARGS[*]}"
        ;;
esac

###############################################################################

# Log the launch line, then start the test process.
#
# With a binding wrapper (--launch_with_binding=..., original note: "for
# hygon cpu") the call is
#     <wrapper> <local rank> <test command ...>
# Without one the test command is run directly; otherwise the local rank
# number would end up in command position and the launch would fail with
# "command not found".
#
# Globals read: launch_with_binding, LOCAL_RANK, APP
# Outputs:      one log line on stdout, followed by the launched command's output.
# Returns:      exit status of the launched command.
launch_test_app() {
    echo "launch_with_binding=${launch_with_binding}, APP=${APP}"

    # APP (and possibly the wrapper) are flat command strings, so they are
    # expanded unquoted on purpose to be split into words.
    # shellcheck disable=SC2086
    if [[ -z ${launch_with_binding} ]]; then
        ${APP}
    else
        ${launch_with_binding} ${LOCAL_RANK} ${APP}
    fi
}

launch_test_app