# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input length.
# OUTPUT_LEN: request output length.
# MIN_CACHE_HIT_PCT: prefix cache hit rate.
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there is no latency requirement, set it to a large number such as 1000000000.
# NUM_SEQS_LIST: a list of `max-num-seqs` values to loop over.
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` values to loop over.
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium-size input/output lengths; for extra-short contexts (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script. It might take a long time; you can use tmux so that the script keeps running if your connection drops.
# 5. The final result will be saved in the RESULT file.
...
@@ -30,31 +34,27 @@
...
@@ -30,31 +34,27 @@
# ---- Benchmark configuration and log-folder setup ----
# Each setting is defined exactly once; edit the values below before running.

# Minute-resolution timestamp used to name this run's log folder.
TAG=$(date +"%Y_%m_%d_%H_%M")
# Base directory: the log folder is created under it, and the script changes
# into "$BASE/vllm" below.
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"

# Per-run output locations derived from BASE and TAG.
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"

echo "result file: $RESULT"
echo "model: $MODEL"
echo

# Start from an empty log folder for this TAG. ${LOG_FOLDER:?} aborts instead
# of running `rm -rf` on an empty path; `--` stops option parsing so a path
# starting with '-' is not treated as a flag.
rm -rf -- "${LOG_FOLDER:?}"
mkdir -p -- "$LOG_FOLDER"

# The remainder of the script runs from inside "$BASE/vllm".
cd "$BASE/vllm"
# create sonnet-4x.txt so that we can sample 2048 tokens for input