modelpath="/public/home/sugon_libo/tests/llms/llama2-13b"

logdir="./log/llama2-13b"

if [ ! -f ${logdir} ]; then
    mkdir ${logdir} -p
fi
all_log="${logdir}/all-log.log"


echo -e "| num prompts | Input length | Output length |  All Throughput (tokens/s) |  Gen Throughput (tokens/s) | Latency (s) |" > $all_log
echo -e "|:----------:|:------------:|:-------------:|:-------------:|:-------------:|:-------------:|" >> $all_log


export VLLM_WORKER_MULTIPROC_METHOD=spawn
export FA_PAD=0
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_MAX_NCHANNELS=20
export NCCL_MIN_NCHANNELS=20
export NCCL_P2P_LEVEL=SYS
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH


for num_prompts in 1 2 4 8 16 32 64 128 256; do

    for prompt_tuple in  "1000 1"  "1000 1000" "2000 1" "2000 2000" "4096 1" ; do
        IFS=' ' read -r input_len output_len <<< "$prompt_tuple"


        tmp_log=${logdir}/numprompts-${num_prompts}-input_len-${input_len}-output_len-${output_len}.log

        python3 ./benchmarks/benchmark_throughput.py --enforce-eager --input-len $input_len --output-len $output_len --num-prompts $num_prompts --tensor-parallel-size 2  --model $modelpath  --dtype float16   --trust-remote-code  2>&1 | tee ${tmp_log}


        avg_latency=`tail -n 7 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $2}'`
        all_tht=`tail -n 6 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $5}'`
        gen_tht=`tail -n 5 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $3}'`


        echo "" | awk -v all_tht=$all_tht \
                    -v gen_tht=$gen_tht \
                    -v avg_latency=$avg_latency \
                    -v num_prompts=$num_prompts \
                    -v input_len=${input_len} -v output_len="$output_len" \
                    '{printf "| %6d | %6d | %6d | %7.2f | %7.2f | %7.2f |\n", num_prompts, input_len, output_len,
                    all_tht, gen_tht, avg_latency}' >> $all_log

    done # input_len output_len
done # num_prompts