Unverified commit 392c5af4, authored by Bin Bao, committed by GitHub
Browse files

[Benchmark] Add startup benchmarking to buildkite run (#33183)


Signed-off-by: Bin Bao <binbao@meta.com>
parent af9b69f9
...@@ -181,19 +181,20 @@ upload_to_buildkite() { ...@@ -181,19 +181,20 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
} }
run_latency_tests() { run_benchmark_tests() {
# run latency tests using `vllm bench latency` command # run benchmark tests using `vllm bench <test_type>` command
# $1: a json file specifying latency test cases # $1: test type (latency or throughput)
# $2: a json file specifying test cases
local latency_test_file local test_type=$1
latency_test_file=$1 local test_file=$2
# Iterate over latency tests # Iterate over tests
jq -c '.[]' "$latency_test_file" | while read -r params; do jq -c '.[]' "$test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^latency_ ]]; then if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
echo "In latency-test.json, test_name must start with \"latency_\"." echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
exit 1 exit 1
fi fi
...@@ -204,15 +205,15 @@ run_latency_tests() { ...@@ -204,15 +205,15 @@ run_latency_tests() {
fi fi
# get arguments # get arguments
latency_params=$(echo "$params" | jq -r '.parameters') bench_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params") bench_args=$(json2args "$bench_params")
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables') bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
latency_envs=$(json2envs "$latency_environment_variables") bench_envs=$(json2envs "$bench_environment_variables")
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1') pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp)) world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
...@@ -225,97 +226,42 @@ run_latency_tests() { ...@@ -225,97 +226,42 @@ run_latency_tests() {
fi fi
fi fi
latency_command=" $latency_envs vllm bench latency \ bench_command=" $bench_envs vllm bench $test_type \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args" $bench_args"
echo "Running test case $test_name" echo "Running test case $test_name"
echo "Latency command: $latency_command" echo "${test_type^} command: $bench_command"
# recoding benchmarking command ang GPU command # recording benchmarking command and GPU command
jq_output=$(jq -n \ jq_output=$(jq -n \
--arg latency "$latency_command" \ --arg command "$bench_command" \
--arg gpu "$gpu_type" \ --arg gpu "$gpu_type" \
--arg test_type "$test_type" \
'{ '{
latency_command: $latency, ($test_type + "_command"): $command,
gpu_type: $gpu gpu_type: $gpu
}') }')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark # run the benchmark
eval "$latency_command" eval "$bench_command"
kill_gpu_processes kill_gpu_processes
done done
} }
run_throughput_tests() { run_latency_tests() {
# run throughput tests using `vllm bench throughput` run_benchmark_tests "latency" "$1"
# $1: a json file specifying throughput test cases }
local throughput_test_file
throughput_test_file=$1
# Iterate over throughput tests
jq -c '.[]' "$throughput_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^throughput_ ]]; then
echo "In throughput-test.json, test_name must start with \"throughput_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
throughput_envs=$(json2envs "$throughput_environment_variables")
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi
throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
echo "Running test case $test_name"
echo "Throughput command: $throughput_command"
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg command "$throughput_command" \
--arg gpu "$gpu_type" \
'{
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
kill_gpu_processes run_startup_tests() {
run_benchmark_tests "startup" "$1"
}
done run_throughput_tests() {
run_benchmark_tests "throughput" "$1"
} }
run_serving_tests() { run_serving_tests() {
...@@ -534,6 +480,7 @@ main() { ...@@ -534,6 +480,7 @@ main() {
# benchmarking # benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
# postprocess benchmarking results # postprocess benchmarking results
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment