echo "data_type,batch,prompt_tokens,completion_tokens,tp,all_throughput(tokens/s),generate_throughput(tokens/s),latency(s)" > qwen2.5_14b-tp2.csv
pairs=("2 1" "2 128" "2 1000"  "2 2000" "512 1" "512 512" "1000 1"  "1000 1000" "2000 1" "2000 2000" "4096 1")

model_path="/workspace/llms/qwen2.5/Qwen2.5-14B-Instruct"

model=${model_path##*/}
tp=2
data_type="fp16"

for batch in 1 2 4 8 16 32 64 128; do
    for pair in "${pairs[@]}"; do
        prompt_tokens=${pair%% *}
        completion_tokens=${pair#* }
i
        echo "data_type: $data_type,batch: $batch, prompt_tokens: $prompt_tokens, completion_tokens: $completion_tokens, tp: ${tp}"

        # benchmark_throughput.py
        python ./benchmarks/benchmark_throughput.py --model ${model_path} --tensor-parallel-size ${tp} --num-prompts ${batch} --input-len ${prompt_tokens} --output-len ${completion_tokens} --dtype float16 --trust-remote-code --enforce-eager 2>&1 | tee vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log
    
        all_tht=`grep "^All Throughput" vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log | awk -F ' ' '{print $5}'`
        gen_tht=`grep "^Generate Throughput" vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log | awk -F ' ' '{print $3}'`
        latency=`grep "^Latency" vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log | awk -F ' ' '{print $2}'`
        
        if [ -z "$all_tht" ]; then
            echo "error: not All Throughput"
            exit 1;
        elif [ -z "$latency" ]; then
            echo "error: not Latency"
            exit 1;
        fi

        echo "$data_type,$batch,$prompt_tokens,$completion_tokens,$tp,$all_tht,$gen_tht,$latency" >> qwen2.5_14b-tp2.csv
    done
done