#!/bin/bash
# Benchmark script for a llama2-13b model with LMDeploy's TurboMind engine.
if [ -z "$1" ]
then
    echo "Error: please provide the path to the llama2-13b model"
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to the TurboMind format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 500
crudini --set "${config_path}" llama max_batch_size 128
# end of update config

cd "${workspace_dir}" || exit 1

# download the ShareGPT dataset used by the throughput benchmark
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# benchmark request throughput (requests per minute), 3 runs per batch size
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"
    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}"/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

# benchmark static inference (token generation) at several concurrency levels
benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"
    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}"/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# read the model's shape parameters back out of config.ini for GEMM tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

# tune gemm; this writes gemm_config.in to the current directory
python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num ${head_num} \
    --size_per_head ${size_per_head} \
    --vocab_size ${vocab_size} \
    --inter_size ${inter_size} \
    --tensor_para_size ${tensor_para_size} \
    --max_batch_size ${max_batch_size}

output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

mv gemm_config.in "${output_path}"
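# Usage sketch. The script's filename and the model path below are hypothetical;
# the output layout follows from the paths constructed above:
#
#   bash benchmark_13b.sh /models/Llama-2-13b-hf
#
# results land in output/Llama-2-13b-hf-tuned-gemm-tp1/:
#   rpm_localhost_batch_64_{1,2,3}th.csv    # request throughput, batch 64
#   rpm_localhost_batch_128_{1,2,3}th.csv   # request throughput, batch 128
#   generation.csv                          # static inference results
#   gemm_config.in                          # tuned GEMM configuration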