[Benchmark] Add startup benchmarking to buildkite run (#33183)

Signed-off-by: Bin Bao <binbao@meta.com>

[Benchmark] Add startup benchmarking to buildkite run (#33183)
Signed-off-by: Bin Bao <binbao@meta.com>
392c5af4 · Bin Bao · GitHub · af9b69f9 · 392c5af4
Unverified Commit 392c5af4 authored Jan 28, 2026 by Bin Bao Committed by GitHub Jan 28, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 35 additions and 88 deletions

.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh ...formance-benchmarks/scripts/run-performance-benchmarks.sh +35 -88

No files found.
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -25,9 +25,9 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g arch_suffix=''
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
@@ -181,19 +181,20 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
-run_latency_tests() {
+run_benchmark_tests() {
-  # run latency tests using `vllm bench latency` command
+  # run benchmark tests using `vllm bench <test_type>` command
-  # $1: a json file specifying latency test cases
+  # $1: test type (latency or throughput)
+  # $2: a json file specifying test cases
-  local latency_test_file
+  local test_type=$1
-  latency_test_file=$1
+  local test_file=$2
-  # Iterate over latency tests
+  # Iterate over tests
-  jq -c '.[]' "$latency_test_file" | while read -r params; do
+  jq -c '.[]' "$test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^latency_ ]]; then
+    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
-      echo "In latency-test.json, test_name must start with \"latency_\"."
+      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
      exit 1
    fi
@@ -204,15 +205,15 @@ run_latency_tests() {
    fi
    # get arguments
-    latency_params=$(echo "$params" | jq -r '.parameters')
+    bench_params=$(echo "$params" | jq -r '.parameters')
-    latency_args=$(json2args "$latency_params")
+    bench_args=$(json2args "$bench_params")
-    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    latency_envs=$(json2envs "$latency_environment_variables")
+    bench_envs=$(json2envs "$bench_environment_variables")
    # check if there is enough GPU to run the test
-    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
+      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -225,97 +226,42 @@ run_latency_tests() {
      fi
    fi
-    latency_command=" $latency_envs vllm bench latency \
+    bench_command=" $bench_envs vllm bench $test_type \
      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $latency_args"
+      $bench_args"
    echo "Running test case $test_name"
-    echo "Latency command: $latency_command"
+    echo "${test_type^} command: $bench_command"
-    # recoding benchmarking command ang GPU command
+    # recording benchmarking command and GPU command
    jq_output=$(jq -n \
-      --arg latency "$latency_command" \
+      --arg command "$bench_command" \
      --arg gpu "$gpu_type" \
+      --arg test_type "$test_type" \
      '{
-        latency_command: $latency,
+        ($test_type + "_command"): $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
    # run the benchmark
-    eval "$latency_command"
+    eval "$bench_command"
    kill_gpu_processes
  done
 }
-run_throughput_tests() {
+run_latency_tests() {
-  # run throughput tests using `vllm bench throughput`
+  run_benchmark_tests "latency" "$1"
-  # $1: a json file specifying throughput test cases
+}
-  local throughput_test_file
-  throughput_test_file=$1
-  # Iterate over throughput tests
-  jq -c '.[]' "$throughput_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^throughput_ ]]; then
-      echo "In throughput-test.json, test_name must start with \"throughput_\"."
-      exit 1
-    fi
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-    # get arguments
-    throughput_params=$(echo "$params" | jq -r '.parameters')
-    throughput_args=$(json2args "$throughput_params")
-    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    throughput_envs=$(json2envs "$throughput_environment_variables")
-    # check if there is enough GPU to run the test
-    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
-      world_size=$(($tp*$pp))
-      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
-        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
-        continue
-      fi
-    else
-      if [[ $gpu_count -lt $tp ]]; then
-        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-        continue
-      fi
-    fi
-    throughput_command=" $throughput_envs vllm bench throughput \
-      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $throughput_args"
-    echo "Running test case $test_name"
-    echo "Throughput command: $throughput_command"
-    # recoding benchmarking command ang GPU command
-    jq_output=$(jq -n \
-      --arg command "$throughput_command" \
-      --arg gpu "$gpu_type" \
-      '{
-        throughput_command: $command,
-        gpu_type: $gpu
-      }')
-    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
-    # run the benchmark
-    eval "$throughput_command"
-    kill_gpu_processes
+run_startup_tests() {
+  run_benchmark_tests "startup" "$1"
+}
-  done
+run_throughput_tests() {
+  run_benchmark_tests "throughput" "$1"
 }
 run_serving_tests() {
@@ -534,6 +480,7 @@ main() {
  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
+  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
  # postprocess benchmarking results