modelpath="/public/home/sugon_libo/tests/llms/llama2-13b" logdir="./log/llama2-13b" if [ ! -f ${logdir} ]; then mkdir ${logdir} -p fi all_log="${logdir}/all-log.log" echo -e "| num prompts | Input length | Output length | All Throughput (tokens/s) | Gen Throughput (tokens/s) | Latency (s) |" > $all_log echo -e "|:----------:|:------------:|:-------------:|:-------------:|:-------------:|:-------------:|" >> $all_log export VLLM_WORKER_MULTIPROC_METHOD=spawn export FA_PAD=0 export ROCBLAS_COMPUTETYPE_FP16R=0 export HSA_FORCE_FINE_GRAIN_PCIE=1 export NCCL_LAUNCH_MODE=GROUP export NCCL_MAX_NCHANNELS=20 export NCCL_MIN_NCHANNELS=20 export NCCL_P2P_LEVEL=SYS export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH for num_prompts in 1 2 4 8 16 32 64 128 256; do for prompt_tuple in "1000 1" "1000 1000" "2000 1" "2000 2000" "4096 1" ; do IFS=' ' read -r input_len output_len <<< "$prompt_tuple" tmp_log=${logdir}/numprompts-${num_prompts}-input_len-${input_len}-output_len-${output_len}.log python3 ./benchmarks/benchmark_throughput.py --enforce-eager --input-len $input_len --output-len $output_len --num-prompts $num_prompts --tensor-parallel-size 2 --model $modelpath --dtype float16 --trust-remote-code 2>&1 | tee ${tmp_log} avg_latency=`tail -n 7 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $2}'` all_tht=`tail -n 6 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $5}'` gen_tht=`tail -n 5 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $3}'` echo "" | awk -v all_tht=$all_tht \ -v gen_tht=$gen_tht \ -v avg_latency=$avg_latency \ -v num_prompts=$num_prompts \ -v input_len=${input_len} -v output_len="$output_len" \ '{printf "| %6d | %6d | %6d | %7.2f | %7.2f | %7.2f |\n", num_prompts, input_len, output_len, all_tht, gen_tht, avg_latency}' >> $all_log done # input_len output_len done # num_prompts