"lib/llm/src/vscode:/vscode.git/clone" did not exist on "045b61dd5bb736973028b2d17b32359d8e4f9e25"
Unverified Commit 01a634d1 authored by ishandhanani's avatar ishandhanani Committed by GitHub
Browse files

feat: SGLang FP8 improvements and vLLM benchmark enhancements (#4675)

parent c9fdc2ea
...@@ -27,19 +27,42 @@ chosen_req_rate=$8 ...@@ -27,19 +27,42 @@ chosen_req_rate=$8
echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[@]}; ${chosen_req_rate}" echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[@]}; ${chosen_req_rate}"
wait_for_model_timeout=1500 # 25 minutes wait_for_model_timeout=3000
wait_for_model_check_interval=5 # check interval -> 5s wait_for_model_check_interval=5
wait_for_model_report_interval=60 # wait_for_model report interval -> 60s wait_for_model_report_interval=60
wait_for_model $head_node $head_port $n_prefill $n_decode $wait_for_model_check_interval $wait_for_model_timeout $wait_for_model_report_interval wait_for_model $head_node $head_port $n_prefill $n_decode \
$wait_for_model_check_interval $wait_for_model_timeout $wait_for_model_report_interval
set -e set -e
# Warmup the model with a sweep of concurrencies
# Warmup defaults
warmup_isl=$chosen_isl warmup_isl=$chosen_isl
warmup_osl=$chosen_osl warmup_osl=$chosen_osl
warmup_req_rate=250 warmup_req_rate=250
warmup_concurrency_list=(1 4 8 32 64 128 256 512 1024 4096) warmup_concurrency_list=(1 4 8 32 64 128 256 512)
# Merge the benchmarked concurrencies into the warmup sweep, then sort it.
#
# Globals:
#   chosen_concurrencies     (read)    concurrency values to be benchmarked
#   warmup_concurrency_list  (updated) gains every chosen value it does not
#                                      already contain (exact string match) and
#                                      is re-sorted numerically; empty entries
#                                      are dropped
# Outputs:
#   Writes "Final warmup list: ..." to stdout.
# Returns:
#   The status of `sort` if sorting fails, so a caller running under `set -e`
#   still aborts; 0 otherwise.
merge_chosen_into_warmup_list() {
  local c w found sorted

  # Ensure all chosen concurrencies are in warmup list
  for c in "${chosen_concurrencies[@]}"; do
    found=false
    for w in "${warmup_concurrency_list[@]}"; do
      if [[ "$c" == "$w" ]]; then
        found=true
        break
      fi
    done
    if [[ "$found" == false ]]; then
      warmup_concurrency_list+=("$c")
    fi
  done

  # Sort warmup list numerically. The values go through a quoted pipeline and
  # are read back line by line, so the global IFS is never modified and no
  # entry is subject to word splitting or pathname expansion.
  sorted=$(printf '%s\n' "${warmup_concurrency_list[@]}" | sort -n) || return
  warmup_concurrency_list=()
  while IFS= read -r w; do
    # Blank lines (an empty list, or an empty entry) are not kept.
    if [[ -n "$w" ]]; then
      warmup_concurrency_list+=("$w")
    fi
  done <<<"$sorted"

  echo "Final warmup list: ${warmup_concurrency_list[@]}"
}
merge_chosen_into_warmup_list
# Warmup
for warmup_concurrency in "${warmup_concurrency_list[@]}" for warmup_concurrency in "${warmup_concurrency_list[@]}"
do do
echo "Warming up model with concurrency $warmup_concurrency" echo "Warming up model with concurrency $warmup_concurrency"
...@@ -73,7 +96,7 @@ for concurrency in "${chosen_concurrencies[@]}" ...@@ -73,7 +96,7 @@ for concurrency in "${chosen_concurrencies[@]}"
do do
num_prompts=$((concurrency * 5)) num_prompts=$((concurrency * 5))
echo "Running benchmark with concurrency: $concurrency and num-prompts: $num_prompts, writing to file ${result_dir}" echo "Running benchmark with concurrency: $concurrency and num-prompts: $num_prompts, writing to file ${result_dir}"
result_filename="isl_${chosen_isl}_osl_${chosen_osl}_concurrency_${concurrency}_req_rate_${chosen_req_rate}_ctx${prefill_gpus}_gen${decode_gpus}.json" result_filename="isl_${chosen_isl}_osl_${chosen_osl}_concurrency_${concurrency}_req_rate_${chosen_req_rate}_ctx_${prefill_gpus}_gen_${decode_gpus}_gpus_${total_gpus}.json"
set -x set -x
echo "$(date '+%Y-%m-%d %H:%M:%S')" echo "$(date '+%Y-%m-%d %H:%M:%S')"
...@@ -98,4 +121,3 @@ do ...@@ -98,4 +121,3 @@ do
echo "Completed benchmark with concurrency: $concurrency" echo "Completed benchmark with concurrency: $concurrency"
echo "-----------------------------------------" echo "-----------------------------------------"
done done
set +e
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment