#!/usr/bin/env bash
# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Benchmark driver for the multi-GPU GPT C++ example: sweeps batch size,
# decode strategy (beam search / sampling) and decode values, runs
# ./bin/multi_gpu_gpt_example under mpirun, and aggregates the reported
# FT latency into a markdown table under the per-run log directory.
#
# Usage: cpp_benchmark.sh <TP size> <PP size>
# $1: TP size (tensor parallelism)
# $2: PP size (pipeline parallelism)

# Abort early with a usage message if either argument is missing.
tensor_para_size=${1:?usage: $0 <TP size> <PP size>}
pipeline_para_size=${2:?usage: $0 <TP size> <PP size>}

export NVIDIA_TF32_OVERRIDE=0

# Integer product — plain shell arithmetic; no need to fork `bc`.
total_gpu_count=$(( tensor_para_size * pipeline_para_size ))

vocab_size=51200

logdir="gpt-TP${tensor_para_size}-PP${pipeline_para_size}-log"
# `mkdir -p` is idempotent; the previous `[ ! -f $logdir ]` guard tested for
# a regular *file*, not a directory, and was unnecessary anyway.
mkdir -p "${logdir}"

all_log="${logdir}/all-log.log"

# Markdown table header for the aggregated results.
echo "| model size | Batch Size | Input length | Output length | Decode value | Precision | FT latency (ms) |" > "${all_log}"
echo "|:----------:|:----------:|:------------:|:-------------:|:------------:|:---------:|:---------------:|" >> "${all_log}"

# Snapshot host CPU and GPU info next to the logs for reproducibility.
cat /proc/cpuinfo > "${logdir}/cpuinfo.txt"
rocm-smi > "${logdir}/gpuinfo.txt"

# for model_size in "345m" "5b";
for model_size in "345m";
do
    if [ "$model_size" = "345m" ]; then
        head_num=16
        size_per_head=64
        # FFN inner dimension = 4 * hidden size (standard GPT ratio).
        inter_size=$(( head_num * size_per_head * 4 ))
        num_layer=24
    # elif [ "$model_size" = "5b" ]; then
    #     head_num=32
    #     size_per_head=128
    #     inter_size=$(( head_num * size_per_head * 4 ))
    #     num_layer=24
    fi

for decode_type in "beamsearch" "sampling";
do

    # Beam search sweeps beam widths; sampling sweeps top-k (integer)
    # and top-p (fractional) values.
    if [ "$decode_type" = "beamsearch" ]; then
        decode_values=(4)
    elif [ "$decode_type" = "sampling" ]; then
        decode_values=(4 0.5)
    fi

for request_batch_size in 1 4 16;
do
for input_length in 60;
do
for request_output_len in 80;
do
for decode_value in "${decode_values[@]}";
do

if [ "$decode_type" = "beamsearch" ]; then
    beam_width=$decode_value
    topk=0
    topp=0.0
elif [ "$decode_type" = "sampling" ]; then
    beam_width=1
    # An all-digit decode value selects top-k sampling; otherwise top-p.
    if [[ $decode_value == +([[:digit:]]) ]]; then
        topk=$decode_value
        topp=0.0
    else
        topk=0
        topp=$decode_value
    fi
fi

tmp_log="${logdir}/batchsize-${request_batch_size}-${input_length}-${request_output_len}-${decode_type}-${decode_value}.log"

python ../examples/pytorch/gpt/utils/generate_start_ids.py --max_batch_size "${request_batch_size}" --max_input_length "${input_length}"
./bin/gpt_gemm "${request_batch_size}" "${beam_width}" "${input_length}" "${head_num}" "${size_per_head}" "${inter_size}" "${vocab_size}" 1 "${tensor_para_size}"
# NOTE(review): `-v 50304` (padded vocab) intentionally(?) differs from the
# vocab_size=51200 passed to gpt_gemm above — confirm this is desired.
python ../examples/pytorch/gpt/utils/generate_gpt_config.py \
                                        --max_seq_len 1024 \
                                        --beam_width "${beam_width}" \
                                        --head_num "${head_num}" \
                                        --size_per_head "${size_per_head}" \
                                        --inter_size "${inter_size}" \
                                        --num_layer "${num_layer}" \
                                        -v 50304 \
                                        -d fp16 \
                                        -topk "${topk}" \
                                        -topp "${topp}" \
                                        --tensor_para_size "${tensor_para_size}" \
                                        --pipeline_para_size "${pipeline_para_size}" \
                                        -request_batch_size "${request_batch_size}" \
                                        --request_output_len "${request_output_len}"
mpirun -n "${total_gpu_count}" --allow-run-as-root ./bin/multi_gpu_gpt_example .tmp.config.ini 2>&1 | tee "${tmp_log}"
# The latency figure is field 17 of the example binary's final output line.
ft_latency=$(tail -n 1 "${tmp_log}" | awk '{print $17}')
echo "" | awk -v ft_latency="$ft_latency" \
            -v batch_size="$request_batch_size" \
            -v input_length="${input_length}" -v request_output_len="$request_output_len" \
            -v model_size="${model_size}" -v decode_value="$decode_value" -v decode_type="$decode_type" \
            '{printf "| %5s | %3d | %4d | %4d | %10s %5s | FP16 | %7.2f |\n", model_size, batch_size, input_length, request_output_len,
              decode_type, decode_value, ft_latency}' >> "${all_log}"

rm -f .tmp.config.ini

done # decode_values
done # request_output_len
done # input_length
done # batch_size
done # decode_type
done # model_size