#!/bin/bash
#
# run.sh — environment setup and vLLM throughput-benchmark driver.
# (Scrape artifacts from a web code viewer removed; shebang restored to line 1.)
# --- GPU visibility and ROCm/HIP runtime settings ---
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1

# --- NCCL/RCCL collective-communication tuning ---
export NCCL_ALGO=Ring
export NCCL_LAUNCH_MODE=GROUP
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export NCCL_IB_TIMEOUT=22
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export NCCL_TOPO_FILE="/workspace/test/topo.xml"
export RCCL_SDMA_COPY_ENABLE=0

# --- Overlap send/recv and all-reduce with compute streams ---
export SENDRECV_STREAM_WITH_COMPUTE=1
export ALLREDUCE_STREAM_WITH_COMPUTE=1

# --- Library search path: prefer the torch-bundled runtime libs ---
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
# --- vLLM allreduce and NUMA-binding configuration ---
# (Fix: scrape-artifact lines — avatar/commit/gutter numbers — were interleaved
# here; they are invalid shell and are removed.)
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_NUMA_BIND=1

# Bind rank i to NUMA node i, one per GPU (ranks 0..7).
for _rank in 0 1 2 3 4 5 6 7; do
    export "VLLM_RANK${_rank}_NUMA=${_rank}"
done
unset _rank

# vLLM RPC timeout — raised so long model loads do not trip engine RPC errors.
# NOTE(review): units presumed milliseconds per vLLM convention — confirm for
# the vLLM version in use.
export VLLM_RPC_TIMEOUT=100000

# NOTE(review): a stray mid-file "#!/bin/bash" was removed here; the shebang
# belongs on line 1 only (mid-file it is an inert comment).

# Model benchmark configuration file. One record per line:
#   name;path;tp;batch_list;prompt_tok_list;completion_tok_list;dtype;max_model_len;gpu_mem_util
readonly MODELS_CONFIG="/workspace/test/models-to-test.cfg"

# Root directory for per-model result output.
readonly RESULTS_DIR="/workspace/test/results"

# Read the config file and benchmark each model in turn.
# Fixes in this block:
#  - scrape-artifact lines ("jerrrrry's avatar", gutter numbers) that sat inside
#    the loop body — invalid shell executed on every iteration — are removed;
#  - the loop reads on fd 3 so that commands launched inside the loop (python)
#    cannot accidentally consume the remainder of the config stream from stdin;
#  - missing config file and malformed records are reported instead of
#    silently producing broken runs.
if [[ ! -f "$MODELS_CONFIG" ]]; then
    echo "config file not found: $MODELS_CONFIG" >&2
    exit 1
fi

while IFS= read -r -u 3 line || [[ -n "$line" ]]; do
    # Skip comment lines and blank lines.
    if [[ "$line" =~ ^# ]] || [[ -z "$line" ]]; then
        continue
    fi

    # Parse one ';'-separated record.
    IFS=';' read -ra CONFIG <<< "$line"
    if (( ${#CONFIG[@]} < 9 )); then
        echo "skipping malformed config line: $line" >&2
        continue
    fi

    model_name="${CONFIG[0]}"
    model_path="${CONFIG[1]}"
    tp="${CONFIG[2]}"
    batch="${CONFIG[3]//,/ }"              # comma list -> space-separated values
    prompt_tokens="${CONFIG[4]//,/ }"
    completion_tokens="${CONFIG[5]//,/ }"
    dtype="${CONFIG[6]}"
    max_model_len="${CONFIG[7]}"
    gpu_memory_utilization="${CONFIG[8]}"

    echo "开始测试模型: $model_name"
    echo "模型路径: $model_path"
    echo "参数配置:"
    echo "  tensor_parallel_size: $tp"
    echo "  batch_sizes: $batch"
    echo "  prompt_tokens: $prompt_tokens"
    echo "  completion_tokens: $completion_tokens"
    echo "  dtype: $dtype"
    echo "  max_model_len: $max_model_len"
    echo "  gpu_memory_utilization: $gpu_memory_utilization"

    # Per-model result directory.
    model_result_dir="${RESULTS_DIR}/${model_name}"
    mkdir -p "$model_result_dir"

    # Run the throughput benchmark. $batch / $prompt_tokens /
    # $completion_tokens are deliberately UNQUOTED: each may hold several
    # space-separated values that must word-split into separate arguments.
    python /workspace/test/benchmark_throughput_0.7.2.py \
        --model "$model_path" \
        --tensor-parallel-size "$tp" \
        --num-prompts $batch \
        --input-len $prompt_tokens \
        --output-len $completion_tokens \
        --dtype "$dtype" \
        --trust-remote-code \
        --max-model-len "$max_model_len" \
        --gpu-memory-utilization "$gpu_memory_utilization" \
        --output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
        2>&1 | tee "${model_result_dir}/${model_name}_tp${tp}.log"

    echo "完成测试模型: $model_name"
    echo "结果保存在: $model_result_dir"
    echo "----------------------------------------"
done 3< "$MODELS_CONFIG"