benchmark_70b.sh

#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please input the model path of llama2-70b model"
    exit 1
fi

workspace_dir=$(dirname $(realpath "$0"))

tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"

# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 4000
crudini --set ${config_path} llama max_batch_size 256
# end of update config

cd ${workspace_dir}

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128 256)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
        python3 profile_throughput.py \
            ShareGPT_V3_unfiltered_cleaned_split.json \
            ${turbomind_model_path} \
            --concurrency "$batch" \
            --num_prompts 3000 \
            --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 64 128 256 \
        --csv ${output_path}/generation.csv
}

output_path="${workspace_dir}"/output/"${model_foldername}"-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation  ${output_path}