#!/bin/bash

# This script should be run inside the CI process.
# This script assumes that we are already inside the vllm/ directory.
# Benchmarking results will be available inside vllm/benchmarks/results/.

# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -x
set -o pipefail

check_gpus() {
  # Detect which accelerator management tool is installed (nvidia-smi,
  # amd-smi or hl-smi, in that order), count the devices it reports and
  # record the device type.
  # Globals written: gpu_count, gpu_type, arch_suffix ('-hpu' for hl-smi,
  #                  empty otherwise)
  # Exits the script with status 1 when no device is found.

  # Start from zero so that a stale or inherited gpu_count can never satisfy
  # the check below when none of the tools is installed.
  declare -g gpu_count=0
  if command -v nvidia-smi >/dev/null 2>&1; then
    # check the number of GPUs and GPU type.
    gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi >/dev/null 2>&1; then
    gpu_count=$(amd-smi list | grep -c 'GPU')
  elif command -v hl-smi >/dev/null 2>&1; then
    gpu_count=$(hl-smi --list | grep -ci "Module ID")
  fi

  if [[ "$gpu_count" -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

  declare -g arch_suffix=''
  declare -g gpu_type

  if command -v nvidia-smi >/dev/null 2>&1; then
    # second whitespace-separated field of each reported device name
    gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi >/dev/null 2>&1; then
    gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  elif command -v hl-smi >/dev/null 2>&1; then
    gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
    arch_suffix='-hpu'
  fi
  echo "GPU type is $gpu_type"
}

check_cpus() {
  # Read the number of NUMA nodes from lscpu and mark this run as CPU-only.
  # Globals written: numa_count, gpu_type (always set to "cpu")
  # Exits the script with status 1 when lscpu reports no NUMA node.
  declare -g numa_count
  numa_count=$(lscpu | awk '/NUMA node\(s\):/ {print $3}')
  if [[ "$numa_count" -gt 0 ]]; then
    echo "NUMA found."
    echo "$numa_count"
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="cpu"
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # Check that HF_TOKEN is set and starts with "hf_".
  # Exits the script with status 1 otherwise.
  #
  # This script runs under `set -x`; tracing the comparisons below would
  # print the token value into the CI log, so xtrace is suspended while the
  # token is inspected and restored afterwards.
  local xtrace_was_on=0
  if [[ $- == *x* ]]; then
    xtrace_was_on=1
  fi
  { set +x; } 2>/dev/null

  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  fi
  echo "HF_TOKEN is set and valid."

  if ((xtrace_was_on)); then
    set -x
  fi
}

ensure_sharegpt_downloaded() {
  # Download the ShareGPT dataset into the current directory unless a file
  # with that name is already present.
  # Returns: 0 when the file already exists or wget succeeded, 1 when wget
  #          failed.
  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
  local url="https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE"

  if [[ -f "$FILE" ]]; then
    echo "$FILE already exists."
    return 0
  fi

  if ! wget "$url"; then
    # Remove a partial download; otherwise the next run would see the file
    # and skip the download.
    rm -f -- "$FILE"
    echo "Failed to download $FILE." >&2
    return 1
  fi
}

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  #
  # A JSON null (what `jq '.some_missing_key'` prints) is treated as an empty
  # object and yields an empty line.
  # Returns: jq's exit status when the input cannot be processed.
  local json_string=$1
  local args
  # Assignment is kept separate from `local` so that jq's status is not masked.
  args=$(
    jq -r '
      (. // {}) |
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    ' <<<"$json_string"
  ) || return
  printf '%s\n' "$args"
}

json2envs() {
  # transforms the JSON string to environment variables.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  #
  # A JSON null (what `jq '.some_missing_key'` prints) is treated as an empty
  # object and yields an empty line.
  # Returns: jq's exit status when the input cannot be processed.
  local json_string=$1
  local args
  # Assignment is kept separate from `local` so that jq's status is not masked.
  args=$(
    jq -r '
      (. // {}) |
      to_entries |
      map(.key + "=" + (.value | tostring)) |
      join(" ")
    ' <<<"$json_string"
  ) || return
  printf '%s\n' "$args"
}

wait_for_server() {
  # Wait for the vllm server to start: POST to the completions endpoint on
  # localhost:8000 once per second until curl succeeds.
  # Arguments: $1 - optional timeout in seconds (default: 1200)
  # Returns:   0 as soon as curl succeeds, 1 if the timeout expires first.
  # NOTE: a crashed server is not detected directly; it shows up as a timeout.
  local timeout_seconds=${1:-1200}
  if timeout "$timeout_seconds" bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done'; then
    return 0
  fi
  return 1
}

kill_processes_launched_by_current_bash() {
  # Send SIGKILL to every direct child of this script whose command matches $1.
  # Arguments: $1 - awk regular expression, matched against the first word of
  #                 each child's command line
  # `$$` is the PID of the script's main shell, also when called from a subshell.
  local current_shell_pid=$$
  local processes
  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
  if [[ -n "$processes" ]]; then
    echo "Killing the following processes matching '$1':"
    echo "$processes"
    echo "$processes" | xargs kill -9
  else
    echo "No processes found matching '$1'."
  fi
}

kill_gpu_processes() {
  # Clean up between benchmark runs:
  #   - SIGKILL whatever listens on port 8000, every process pgrep finds for
  #     "python3", and every process pgrep finds for "VLLM"
  #   - block until the first device reading drops below 1000
  #   - delete ~/.config/vllm
  # WARNING: this kills python3 processes regardless of who started them.

  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pgrep VLLM | xargs -r kill -9

  # wait until GPU memory usage smaller than 1GB
  # (if a tool prints nothing numeric, `[` fails and the loop simply ends)
  if command -v nvidia-smi >/dev/null 2>&1; then
    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
      sleep 1
    done
  elif command -v amd-smi >/dev/null 2>&1; then
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
  elif command -v hl-smi >/dev/null 2>&1; then
    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
      sleep 1
    done
  fi

  # remove vllm config file
  rm -rf ~/.config/vllm

}

upload_to_buildkite() {
  # upload the benchmarking results to buildkite
  # Globals read: RESULTS_FOLDER, BUILDKITE_LABEL
  # Returns: 0 without uploading when no buildkite-agent binary is found;
  #          otherwise the status of the artifact upload.

  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
  local buildkite_agent_command
  if command -v buildkite-agent >/dev/null 2>&1; then
    buildkite_agent_command="buildkite-agent"
  elif [[ -f /workspace/buildkite-agent ]]; then
    buildkite_agent_command="/workspace/buildkite-agent"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # Use the determined command to annotate and upload artifacts
  "$buildkite_agent_command" annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
  # The glob is quoted on purpose: it is passed to the agent unexpanded.
  "$buildkite_agent_command" artifact upload "$RESULTS_FOLDER/*"
}

run_latency_tests() {
  # run latency tests using `vllm bench latency` command
  # $1: a json file specifying latency test cases
  #
  # Globals read: TEST_SELECTOR, ON_CPU, REMOTE_HOST, numa_count, gpu_count,
  #               gpu_type, RESULTS_FOLDER
  # For every executed case, $RESULTS_FOLDER/<test_name>.commands records the
  # command line and GPU type, and the benchmark is told to write its result
  # to $RESULTS_FOLDER/<test_name>.json.

  local latency_test_file
  latency_test_file=$1

  # Iterate over latency tests.
  # NOTE: the loop is the last stage of a pipeline and therefore runs in a
  # subshell; `exit 1` below aborts the remaining latency cases only, not the
  # whole script.
  jq -c '.[]' "$latency_test_file" | while read -r params; do
    # get the test name and validate its prefix
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^latency_ ]]; then
      echo "In latency-test.json, test_name must start with \"latency_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")
    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    latency_envs=$(json2envs "$latency_environment_variables")

    # check if there is enough GPU to run the test
    # (a parallel size missing from the JSON counts as 1 rather than "null")
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size // 1')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$((tp * pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    latency_command=" $latency_envs vllm bench latency \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"

    echo "Running test case $test_name"
    echo "Latency command: $latency_command"

    # record the benchmarking command and GPU type
    jq_output=$(jq -n \
      --arg latency "$latency_command" \
      --arg gpu "$gpu_type" \
      '{
        latency_command: $latency,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    # NOTE(review): eval executes text taken from the test JSON; the test
    # files must be trusted.
    eval "$latency_command"

    kill_gpu_processes

  done
}

run_throughput_tests() {
  # run throughput tests using `vllm bench throughput`
  # $1: a json file specifying throughput test cases
  #
  # Globals read: TEST_SELECTOR, ON_CPU, REMOTE_HOST, numa_count, gpu_count,
  #               gpu_type, RESULTS_FOLDER
  # For every executed case, $RESULTS_FOLDER/<test_name>.commands records the
  # command line and GPU type, and the benchmark is told to write its result
  # to $RESULTS_FOLDER/<test_name>.json.

  local throughput_test_file
  throughput_test_file=$1

  # Iterate over throughput tests.
  # NOTE: the loop is the last stage of a pipeline and therefore runs in a
  # subshell; `exit 1` below aborts the remaining throughput cases only, not
  # the whole script.
  jq -c '.[]' "$throughput_test_file" | while read -r params; do
    # get the test name and validate its prefix
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^throughput_ ]]; then
      echo "In throughput-test.json, test_name must start with \"throughput_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")
    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    throughput_envs=$(json2envs "$throughput_environment_variables")

    # check if there is enough GPU to run the test
    # (a parallel size missing from the JSON counts as 1 rather than "null")
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size // 1')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$((tp * pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    throughput_command=" $throughput_envs vllm bench throughput \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

    echo "Running test case $test_name"
    echo "Throughput command: $throughput_command"

    # record the benchmarking command and GPU type
    jq_output=$(jq -n \
      --arg command "$throughput_command" \
      --arg gpu "$gpu_type" \
      '{
        throughput_command: $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    # NOTE(review): eval executes text taken from the test JSON; the test
    # files must be trusted.
    eval "$throughput_command"

    kill_gpu_processes

  done
}

run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
  #
  # Globals read: TEST_SELECTOR, ON_CPU, REMOTE_HOST, REMOTE_PORT, numa_count,
  #               gpu_count, gpu_type, RESULTS_FOLDER
  # For each test case a server is started locally (unless REMOTE_HOST is
  # set), then the client runs once per (qps, max_concurrency) combination.
  # Each run records its commands in
  # $RESULTS_FOLDER/<test_name>_qps_<qps>_concurrency_<n>.commands.

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests.
  # NOTE: the loop is the last stage of a pipeline and therefore runs in a
  # subshell; `exit 1` below aborts the remaining serving cases only, not the
  # whole script.
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name and validate its prefix
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
      echo "In serving-test.json, test_name must start with \"serving_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    # @sh leaves numbers bare and wraps strings such as "inf" in single quotes
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
      # no explicit list: use the client's num_prompts as the only value
      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
      max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"

    # check if there is enough resources to run the test
    # (a parallel size missing from the JSON counts as 1 rather than "null")
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size // 1')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$((tp * pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    # check if server model and client model is aligned
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

    server_command="$server_envs vllm serve \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
    # reset per test case so the clean-up below never uses a stale or empty pid
    server_pid=""
    if [[ -z "${REMOTE_HOST}" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
      if wait_for_server; then
        echo ""
        echo "vLLM server is up and running."
      else
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ -n "${REMOTE_PORT}" ]]; then
        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
      else
        client_remote_args=" --host=$REMOTE_HOST "
      fi
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      # iterate over different max_concurrency
      for max_concurrency in $max_concurrency_list; do
        new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
        echo " new test name $new_test_name"
        # pass the tensor parallel size to the client so that it can be displayed
        # on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
          --metadata tensor_parallel_size=$tp \
          $client_args $client_remote_args "

        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"

        bash -c "$client_command"

        # record the benchmarking commands
        jq_output=$(jq -n \
          --arg server "$server_command" \
          --arg client "$client_command" \
          --arg gpu "$gpu_type" \
          '{
            server_command: $server,
            client_command: $client,
            gpu_type: $gpu
          }')
        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

      done
    done

    # clean up
    # only a locally started server has a pid to kill
    if [[ -n "$server_pid" ]]; then
      kill -9 "$server_pid"
    fi
    kill_gpu_processes
  done
}

main() {
  # Entry point: detect the hardware, install missing tools, prepare the
  # results folder, run the serving, latency and throughput suites, convert
  # the results to markdown and upload them to Buildkite.
  # Globals read: ON_CPU, SERVING_JSON, LATENCY_JSON, THROUGHPUT_JSON
  #               (each *_JSON overrides the default test file name)
  local ARCH
  ARCH=''
  if [ "$ON_CPU" == "1" ]; then
    check_cpus
    ARCH='-cpu'
  else
    check_gpus
    ARCH="$arch_suffix"
  fi
  check_hf_token

  # dependencies
  (command -v wget && command -v curl) || (apt-get update && apt-get install -y wget curl)
  (command -v jq) || (apt-get update && apt-get -y install jq)
  (command -v lsof) || (apt-get update && apt-get install -y lsof)

  # get the current IP address, required by `vllm bench serve` command
  VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  export VLLM_HOST_IP
  # turn off the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/

  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)

  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking
  run_serving_tests "$QUICK_BENCHMARK_ROOT/tests/${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests "$QUICK_BENCHMARK_ROOT/tests/${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_throughput_tests "$QUICK_BENCHMARK_ROOT/tests/${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
  pip install tabulate pandas
  python3 "$QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py"

  upload_to_buildkite
}

# Run the benchmark pipeline, forwarding all command line arguments to main.
main "$@"