# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# $1: TP size
# $2: PP size

export NVIDIA_TF32_OVERRIDE=0

tensor_para_size=$1
pipeline_para_size=$2
# Total GPUs = TP size * PP size; plain integer arithmetic, bc is not needed.
total_gpu_count=$((tensor_para_size * pipeline_para_size))
vocab_size=51200

logdir="gpt-TP${tensor_para_size}-PP${pipeline_para_size}-log"
if [ ! -d "${logdir}" ]; then
    mkdir -p "${logdir}"
fi

all_log="${logdir}/all-log.log"

echo -e "| model size | Batch Size | Input length | Output length | Decode | Precision | FT latency (ms) |" > $all_log
echo -e "|:----------:|:----------:|:------------:|:-------------:|:------:|:---------:|:---------------:|" >> $all_log

# Record host CPU and GPU information alongside the benchmark logs.
cat /proc/cpuinfo > ${logdir}/cpuinfo.txt
rocm-smi > ${logdir}/gpuinfo.txt

# for model_size in "345m" "5b";
for model_size in "345m";
do

if [ "$model_size" = "345m" ]; then
    head_num=16
    size_per_head=64
    inter_size=$((head_num * size_per_head * 4))
    num_layer=24
# elif [ "$model_size" = "5b" ]; then
#     head_num=32
#     size_per_head=128
#     inter_size=$((head_num * size_per_head * 4))
#     num_layer=24
fi

for decode_type in "beamsearch" "sampling";
do
    # beamsearch sweeps the beam width; sampling sweeps a top-k (integer)
    # and a top-p (fractional) value.
    if [ "$decode_type" = "beamsearch" ]; then
        decode_values=(4)
    elif [ "$decode_type" = "sampling" ]; then
        decode_values=(4 0.5)
    fi

for request_batch_size in 1 4 16;
do
for input_length in 60;
do
for request_output_len in 80;
do
for decode_value in "${decode_values[@]}";
do
    if [ "$decode_type" = "beamsearch" ]; then
        beam_width=$decode_value
        topk=0
        topp=0.0
    elif [ "$decode_type" = "sampling" ]; then
        beam_width=1
        # An integer decode value means top-k sampling; otherwise top-p.
        if [[ $decode_value == +([[:digit:]]) ]]; then
            topk=$decode_value
            topp=0.0
        else
            topk=0
            topp=$decode_value
        fi
    fi

    tmp_log=${logdir}/batchsize-${request_batch_size}-decode_value-${decode_value}-${input_length}-${request_output_len}-${decode_type}.log

    python ../examples/pytorch/gpt/utils/generate_start_ids.py --max_batch_size ${request_batch_size} --max_input_length ${input_length}
    # Profile GEMM algorithms for this problem shape before the timed run.
    ./bin/gpt_gemm ${request_batch_size} ${beam_width} ${input_length} ${head_num} ${size_per_head} ${inter_size} ${vocab_size} 1 ${tensor_para_size}
    # generate_gpt_config.py writes the runtime settings to .tmp.config.ini.
    python ../examples/pytorch/gpt/utils/generate_gpt_config.py \
        --max_seq_len 1024 \
        --beam_width ${beam_width} \
        --head_num ${head_num} \
        --size_per_head ${size_per_head} \
        --inter_size ${inter_size} \
        --num_layer ${num_layer} \
        -v 50304 \
        -d fp16 \
        -topk ${topk} \
        -topp ${topp} \
        --tensor_para_size ${tensor_para_size} \
        --pipeline_para_size ${pipeline_para_size} \
        --request_batch_size ${request_batch_size} \
        --request_output_len ${request_output_len}
    mpirun -n ${total_gpu_count} --allow-run-as-root ./bin/multi_gpu_gpt_example .tmp.config.ini 2>&1 | tee ${tmp_log}

    # The latency is the 17th field of the last line the example prints.
    ft_latency=$(tail -n 1 ${tmp_log} | awk '{print $17}')
    echo "" | awk -v ft_latency=$ft_latency \
        -v batch_size=$request_batch_size \
        -v input_length=${input_length} -v request_output_len="$request_output_len" \
        -v model_size=${model_size} -v decode_value="$decode_value" -v decode_type="$decode_type" \
        '{printf "| %5s | %3d | %4d | %4d | %10s %5s | FP16 | %7.2f |\n", model_size, batch_size, input_length, request_output_len, decode_type, decode_value, ft_latency}' >> $all_log
    rm .tmp.config.ini

done # decode_values
done # request_output_len
done # input_length
done # batch_size
done # decode_type
done # model_size
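
# ----------------------------------------------------------------------
# Usage sketch (illustrative; the file name below is an assumption,
# substitute whatever this script is saved as). The two positional
# arguments are the tensor- and pipeline-parallel sizes, and their
# product must match the number of GPUs visible to mpirun:
#
#   bash gpt_benchmark.sh 2 1    # TP=2, PP=1 -> 2 GPUs
#
# Latency results are collected in gpt-TP2-PP1-log/all-log.log as a
# markdown table; the raw stdout of each run is kept in the
# per-configuration batchsize-*.log files in the same directory.
# ----------------------------------------------------------------------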