echo "data_type,batch,prompt_tokens,completion_tokens,tp,all_throughput(tokens/s),generate_throughput(tokens/s),latency(s)" > qwen2.5_14b-tp2.csv pairs=("2 1" "2 128" "2 1000" "2 2000" "512 1" "512 512" "1000 1" "1000 1000" "2000 1" "2000 2000" "4096 1") model_path="/workspace/llms/qwen2.5/Qwen2.5-14B-Instruct" model=${model_path##*/} tp=2 data_type="fp16" for batch in 1 2 4 8 16 32 64 128; do for pair in "${pairs[@]}"; do prompt_tokens=${pair%% *} completion_tokens=${pair#* } i echo "data_type: $data_type,batch: $batch, prompt_tokens: $prompt_tokens, completion_tokens: $completion_tokens, tp: ${tp}" # benchmark_throughput.py python ./benchmarks/benchmark_throughput.py --model ${model_path} --tensor-parallel-size ${tp} --num-prompts ${batch} --input-len ${prompt_tokens} --output-len ${completion_tokens} --dtype float16 --trust-remote-code --enforce-eager 2>&1 | tee vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log all_tht=`grep "^All Throughput" vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log | awk -F ' ' '{print $5}'` gen_tht=`grep "^Generate Throughput" vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log | awk -F ' ' '{print $3}'` latency=`grep "^Latency" vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log | awk -F ' ' '{print $2}'` if [ -z "$all_tht" ]; then echo "error: not All Throughput" exit 1; elif [ -z "$latency" ]; then echo "error: not Latency" exit 1; fi echo "$data_type,$batch,$prompt_tokens,$completion_tokens,$tp,$all_tht,$gen_tht,$latency" >> qwen2.5_14b-tp2.csv done done