Commit 4d3a2c28 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.764
- name: "exact_match,flexible-extract"
value: 0.764
limit: 250
num_fewshot: 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.356
- name: "exact_match,flexible-extract"
value: 0.358
limit: 1000
num_fewshot: 5
Meta-Llama-3-8B-Instruct.yaml Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml Minitron-4B-Base-FP8.yaml
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers. # We can use this script to compute baseline accuracy on GSM for transformers.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 # pip install lm-eval==0.4.4
usage() { usage() {
echo`` echo``
...@@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do ...@@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
done done
lm_eval --model hf \ lm_eval --model hf \
--model_args pretrained=$MODEL,parallelize=True \ --model_args "pretrained=$MODEL,parallelize=True" \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size $BATCH_SIZE --batch_size "$BATCH_SIZE"
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.3 # pip install lm-eval==0.4.4
usage() { usage() {
echo`` echo``
...@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do ...@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done done
lm_eval --model vllm \ lm_eval --model vllm \
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size $BATCH_SIZE --batch_size "$BATCH_SIZE"
...@@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do ...@@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
done done
# Parse list of configs. # Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do do
......
...@@ -49,10 +49,15 @@ def test_lm_eval_correctness(): ...@@ -49,10 +49,15 @@ def test_lm_eval_correctness():
results = launch_lm_eval(eval_config) results = launch_lm_eval(eval_config)
# Confirm scores match ground truth. # Confirm scores match ground truth.
success = True
for task in eval_config["tasks"]: for task in eval_config["tasks"]:
for metric in task["metrics"]: for metric in task["metrics"]:
ground_truth = metric["value"] ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]] measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: ' print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}') f'ground_truth={ground_truth} | measured={measured_value}')
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) success = success and numpy.isclose(
ground_truth, measured_value, rtol=RTOL)
# Assert at the end, print all scores even on failure for debugging.
assert success
...@@ -9,8 +9,11 @@ steps: ...@@ -9,8 +9,11 @@ steps:
- image: badouralix/curl-jq - image: badouralix/curl-jq
command: command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait - wait
- label: "A100" - label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents: agents:
queue: A100 queue: A100
plugins: plugins:
...@@ -18,7 +21,7 @@ steps: ...@@ -18,7 +21,7 @@ steps:
podSpec: podSpec:
priorityClassName: perf-benchmark priorityClassName: perf-benchmark
containers: containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command: command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources: resources:
...@@ -41,20 +44,48 @@ steps: ...@@ -41,20 +44,48 @@ steps:
- name: devshm - name: devshm
emptyDir: emptyDir:
medium: Memory medium: Memory
# - label: "H100"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# command:
# - bash
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
# mount-buildkite-agent: true
# propagate-environment: true
# ipc: host
# gpus: all
# environment:
# - VLLM_USAGE_SOURCE
# - HF_TOKEN
- label: "H200"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: 4,5,6,7
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
- block: "Run H100 Benchmark"
key: block-h100
depends_on: ~
- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: block-h100
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
## Description
This file contains the downloading link for benchmarking results.
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)
Please download the visualization scripts in the post
## Results reproduction
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code
```
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside `./benchmarks/results`.
# Nightly benchmark # Nightly benchmark
The main goal of this benchmarking is two-fold: This benchmark aims to:
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload. - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md](). - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
## Docker images
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 ## Setup
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1 - Docker images:
- vLLM: `vllm/vllm-openai:v0.6.2`
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. --> - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
## Hardware - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
One AWS node with 8x NVIDIA A100 GPUs. - Hardware
- 8x Nvidia A100 GPUs
- Workload:
## Workload description - Dataset
- ShareGPT dataset
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed). - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Output length: the corresponding output length of these 500 prompts. - Models: llama-3 8B, llama-3 70B.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
# Known issues
## Plots
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed. - TGI does not support `ignore-eos` flag.
\ No newline at end of file
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
## Results
{nightly_results_benchmarking_table}
...@@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec ...@@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec
common_container_settings: &common_container_settings common_container_settings: &common_container_settings
command: command:
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
resources: resources:
limits: limits:
nvidia.com/gpu: 8 nvidia.com/gpu: 8
...@@ -37,7 +37,10 @@ common_container_settings: &common_container_settings ...@@ -37,7 +37,10 @@ common_container_settings: &common_container_settings
steps: steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- label: "A100 trt benchmark"
- label: "A100 vllm step 10"
priority: 100 priority: 100
agents: agents:
queue: A100 queue: A100
...@@ -46,7 +49,21 @@ steps: ...@@ -46,7 +49,21 @@ steps:
podSpec: podSpec:
<<: *common_pod_spec <<: *common_pod_spec
containers: containers:
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - image: vllm/vllm-openai:v0.6.2
<<: *common_container_settings
- label: "A100 sglang benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: lmsysorg/sglang:v0.3.2-cu121
<<: *common_container_settings <<: *common_container_settings
- label: "A100 lmdeploy benchmark" - label: "A100 lmdeploy benchmark"
...@@ -58,11 +75,13 @@ steps: ...@@ -58,11 +75,13 @@ steps:
podSpec: podSpec:
<<: *common_pod_spec <<: *common_pod_spec
containers: containers:
- image: openmmlab/lmdeploy:v0.5.0 - image: openmmlab/lmdeploy:v0.6.1-cu12
<<: *common_container_settings <<: *common_container_settings
- label: "A100 vllm benchmark"
- label: "A100 trt llama-8B"
priority: 100 priority: 100
agents: agents:
queue: A100 queue: A100
...@@ -71,10 +90,25 @@ steps: ...@@ -71,10 +90,25 @@ steps:
podSpec: podSpec:
<<: *common_pod_spec <<: *common_pod_spec
containers: containers:
- image: vllm/vllm-openai:latest - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
<<: *common_container_settings <<: *common_container_settings
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- name: TEST_SELECTOR
value: "llama8B"
- label: "A100 tgi benchmark"
- label: "A100 trt llama-70B"
priority: 100 priority: 100
agents: agents:
queue: A100 queue: A100
...@@ -83,12 +117,54 @@ steps: ...@@ -83,12 +117,54 @@ steps:
podSpec: podSpec:
<<: *common_pod_spec <<: *common_pod_spec
containers: containers:
- image: ghcr.io/huggingface/text-generation-inference:2.1 - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
<<: *common_container_settings <<: *common_container_settings
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- name: TEST_SELECTOR
value: "llama70B"
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
# - label: "A100 trt benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
# <<: *common_container_settings
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
# - label: "A100 tgi benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
# <<: *common_container_settings
- wait - wait
- label: "Plot" - label: "Collect the results"
priority: 100 priority: 100
agents: agents:
queue: A100 queue: A100
...@@ -117,4 +193,4 @@ steps: ...@@ -117,4 +193,4 @@ steps:
name: hf-token-secret name: hf-token-secret
key: token key: token
- wait - block: ":rocket: check the results!"
\ No newline at end of file \ No newline at end of file
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
main() {
check_gpus
check_hf_token
df -h
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# run lmdeploy
if which lmdeploy >/dev/null; then
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
exit 0
fi
# run tgi
if [ -e /tgi-entrypoint.sh ]; then
echo "tgi is available, redirect to run-tgi-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
exit 0
fi
# run trt
if which trtllm-build >/dev/null; then
echo "trtllm is available, redirect to run-trt-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
exit 0
fi
# run vllm
if [ -e /vllm-workspace ]; then
echo "vllm is available, redirect to run-vllm-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
exit 0
fi
}
main "$@"
\ No newline at end of file
...@@ -56,7 +56,7 @@ serving_column_mapping = { ...@@ -56,7 +56,7 @@ serving_column_mapping = {
def read_markdown(file): def read_markdown(file):
if os.path.exists(file): if os.path.exists(file):
with open(file, "r") as f: with open(file) as f:
return f.read() + "\n" return f.read() + "\n"
else: else:
return f"{file} not found.\n" return f"{file} not found.\n"
...@@ -75,14 +75,14 @@ if __name__ == "__main__": ...@@ -75,14 +75,14 @@ if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
if "serving" in str(test_file): if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py` # this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f: with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read()) command = json.loads(f.read())
raw_result.update(command) raw_result.update(command)
...@@ -97,7 +97,7 @@ if __name__ == "__main__": ...@@ -97,7 +97,7 @@ if __name__ == "__main__":
# this result is generated via `benchmark_latency.py` # this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f: with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read()) command = json.loads(f.read())
raw_result.update(command) raw_result.update(command)
...@@ -119,7 +119,7 @@ if __name__ == "__main__": ...@@ -119,7 +119,7 @@ if __name__ == "__main__":
# this result is generated via `benchmark_throughput.py` # this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f: with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read()) command = json.loads(f.read())
raw_result.update(command) raw_result.update(command)
...@@ -157,6 +157,18 @@ if __name__ == "__main__": ...@@ -157,6 +157,18 @@ if __name__ == "__main__":
throughput_results, throughput_results,
serving_results) serving_results)
for df in [latency_results, serving_results, throughput_results]:
if df.empty:
continue
# Sort all dataframes by their respective "Test name" columns
df.sort_values(by="Test name", inplace=True)
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
# get markdown tables # get markdown tables
latency_md_table = tabulate(latency_results, latency_md_table = tabulate(latency_results,
headers='keys', headers='keys',
......
import argparse import argparse
import json import json
import math
from pathlib import Path from pathlib import Path
import matplotlib.pyplot as plt import numpy as np
import pandas as pd import pandas as pd
from tabulate import tabulate from tabulate import tabulate
...@@ -25,15 +24,55 @@ def parse_arguments(): ...@@ -25,15 +24,55 @@ def parse_arguments():
return args return args
def get_perf(df, method, model, metric):
means = []
for qps in [2, 4, 8, 16, "inf"]:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
target = target & df['Test name'].str.contains("qps_" + str(qps))
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
else:
means.append(filtered_df[metric].values[0])
return np.array(means)
def get_perf_w_std(df, method, model, metric):
if metric in ["TTFT", "ITL"]:
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
mean = mean.tolist()
std = get_perf(df, method, model, "Std " + metric + " (ms)")
if std.mean() == 0:
std = None
success = get_perf(df, method, model, "Successful req.")
if std is not None:
std = std / np.sqrt(success)
std = std.tolist()
else:
assert metric == "Tput"
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
df, method, model, "Output Tput (tok/s)")
mean = mean.tolist()
std = None
return mean, std
def main(args): def main(args):
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
results_folder = Path(args.results_folder) results_folder = Path(args.results_folder)
results = [] results = []
# collect results # collect results
for test_file in results_folder.glob("*_nightly_results.json"): for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f: with open(test_file) as f:
results = results + json.loads(f.read()) results = results + json.loads(f.read())
# generate markdown table # generate markdown table
...@@ -41,7 +80,7 @@ def main(args): ...@@ -41,7 +80,7 @@ def main(args):
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f: with open(args.description) as f:
description = f.read() description = f.read()
description = description.format( description = description.format(
...@@ -50,85 +89,6 @@ def main(args): ...@@ -50,85 +89,6 @@ def main(args):
with open("nightly_results.md", "w") as f: with open("nightly_results.md", "w") as f:
f.write(description) f.write(description)
plt.rcParams.update({'font.size': 20})
# plot results
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
fig.subplots_adjust(hspace=1)
methods = ["vllm", "trt", "lmdeploy", "tgi"]
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
for j, metric in enumerate(["TTFT", "ITL"]):
means, stds = [], []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
stds.append(0.)
else:
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
std = filtered_df[f"Std {metric} (ms)"].values[0]
success = filtered_df["Successful req."].values[0]
stds.append(std / math.sqrt(success))
print(model, metric)
print(means, stds)
ax = axes[i, j + 1]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
means,
yerr=stds,
capsize=10,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel(f"{metric} (ms)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
metric = "Tput"
j = 0
if True:
tputs = []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
tputs.append(0.)
else:
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
tputs.append(input_tput + output_tput)
print(model, metric)
print(tputs)
ax = axes[i, j]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
tputs,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel("Tput (token/s)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
fig.tight_layout()
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
if __name__ == '__main__': if __name__ == '__main__':
args = parse_arguments() args = parse_arguments()
......
#!/bin/bash
# Currently FP8 benchmark is NOT enabled.
set -x
server_params=$1
common_params=$2
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
launch_trt_server() {
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
# create model caching directory
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
# clone tensorrt backend
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout "$trt_llm_version"
git submodule update --init --recursive
# build trtllm engine
cd /tensorrtllm_backend
cd "./tensorrt_llm/examples/${model_type}"
python3 convert_checkpoint.py \
--model_dir "${model_path}" \
--dtype "${model_dtype}" \
--tp_size "${model_tp_size}" \
--output_dir "${trt_model_path}"
trtllm-build \
--checkpoint_dir "${trt_model_path}" \
--use_fused_mlp \
--reduce_fusion disable \
--workers 8 \
--gpt_attention_plugin "${model_dtype}" \
--gemm_plugin "${model_dtype}" \
--tp_size "${model_tp_size}" \
--max_batch_size "${max_batch_size}" \
--max_input_len "${max_input_len}" \
--max_seq_len "${max_seq_len}" \
--max_num_tokens "${max_num_tokens}" \
--output_dir "${trt_engine_path}"
# handle triton protobuf files and launch triton server
cd /tensorrtllm_backend
mkdir triton_model_repo
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
cd triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size="${model_tp_size}" \
--model_repo=/tensorrtllm_backend/triton_model_repo &
}
launch_tgi_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
port=$(echo "$common_params" | jq -r '.port')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
echo "Server command: $server_command"
eval "$server_command" &
}
launch_lmdeploy_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
port=$(echo "$common_params" | jq -r '.port')
server_args=$(json2args "$server_params")
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Server command: $server_command"
bash -c "$server_command" &
}
launch_sglang_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
port=$(echo "$common_params" | jq -r '.port')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
fi
# run the server
echo "Server command: $server_command"
eval "$server_command" &
}
launch_vllm_server() {
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
port=$(echo "$common_params" | jq -r '.port')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Server command: $server_command"
eval "$server_command" &
}
main() {
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
launch_trt_server
fi
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
launch_tgi_server
fi
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
launch_lmdeploy_server
fi
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
launch_sglang_server
fi
if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
launch_vllm_server
fi
}
main
#!/bin/bash
server_params=$1
common_params=$2
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)
# make sure the parameter inside tensorrt_demo is consistent to envvar
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
python ../quantization/quantize.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path} \
--qformat fp8 \
--kv_cache_dtype fp8 \
--calib_size 2
else
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
fi
trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path}
cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &
\ No newline at end of file
...@@ -8,6 +8,7 @@ main() { ...@@ -8,6 +8,7 @@ main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq) (which jq) || (apt-get update && apt-get -y install jq)
(which zip) || (apt-get install -y zip)
if [ ! -f /workspace/buildkite-agent ]; then if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results." echo "buildkite-agent binary not found. Skip plotting the results."
...@@ -15,26 +16,63 @@ main() { ...@@ -15,26 +16,63 @@ main() {
fi fi
# initial annotation # initial annotation
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results # download results
cd $VLLM_SOURCE_CODE_LOC/benchmarks cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
mkdir -p results/ mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls ls
ls results/ ls results/
# generate figures # upload benchmark results
python3 -m pip install tabulate pandas matplotlib zip -r results.zip results/
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ /workspace/buildkite-agent artifact upload "results.zip"
--description $description \
--results-folder results/ # upload benchmarking scripts
cd "$VLLM_SOURCE_CODE_LOC/"
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
# The figures should be genereated by a separate process outside the CI/CD pipeline
# # generate figures
# python3 -m pip install tabulate pandas matplotlib
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
# --description $description \
# --results-folder results/
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sharegpt
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_2048_128
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_128_2048
# upload results and figures # # upload results and figures
/workspace/buildkite-agent artifact upload "nightly_results.png" # /workspace/buildkite-agent artifact upload "nightly_results*.png"
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
} }
main "$@" main "$@"
\ No newline at end of file
#!/bin/bash #!/bin/bash
set -o pipefail set -o pipefail
set -x
check_gpus() { check_gpus() {
# check the number of GPUs and GPU type. # check the number of GPUs and GPU type.
...@@ -11,19 +12,70 @@ check_gpus() { ...@@ -11,19 +12,70 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking." echo "Need at least 1 GPU to run benchmarking."
exit 1 exit 1
fi fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
echo "GPU type is $gpu_type" echo "GPU type is $gpu_type"
} }
kill_gpu_processes() { check_hf_token() {
pkill lmdeploy || true # check if HF_TOKEN is available and valid
# waiting for GPU processes to be fully killed if [[ -z "$HF_TOKEN" ]]; then
sleep 10 echo "Error: HF_TOKEN is not set."
# Print the GPU memory usage exit 1
# so that we know if all GPU processes are killed. elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) echo "Error: HF_TOKEN does not start with 'hf_'."
# The memory usage should be 0 MB. exit 1
echo "GPU 0 Memory Usage: $gpu_memory_usage MB" else
echo "HF_TOKEN is set and valid."
fi
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
get_current_llm_serving_engine() {
if which lmdeploy >/dev/null; then
echo "Container: lmdeploy"
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
return
fi
if [ -e /tgi-entrypoint.sh ]; then
echo "Container: tgi"
export CURRENT_LLM_SERVING_ENGINE=tgi
return
fi
if which trtllm-build >/dev/null; then
echo "Container: tensorrt-llm"
export CURRENT_LLM_SERVING_ENGINE=trt
return
fi
if [ -e /sgl-workspace ]; then
echo "Container: sglang"
export CURRENT_LLM_SERVING_ENGINE=sglang
return
fi
if [ -e /vllm-workspace ]; then
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
} }
json2args() { json2args() {
...@@ -42,6 +94,19 @@ json2args() { ...@@ -42,6 +94,19 @@ json2args() {
echo "$args" echo "$args"
} }
kill_gpu_processes() {
pkill -f python
pkill -f python3
pkill -f tritonserver
pkill -f pt_main_thread
pkill -f text-generation
pkill -f lmdeploy
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
}
wait_for_server() { wait_for_server() {
# wait for vllm server to start # wait for vllm server to start
# return 1 if vllm server crashes # return 1 if vllm server crashes
...@@ -51,6 +116,14 @@ wait_for_server() { ...@@ -51,6 +116,14 @@ wait_for_server() {
done' && return 0 || return 1 done' && return 0 || return 1
} }
ensure_installed() {
# Ensure that the given command is installed by apt-get
local cmd=$1
if ! which "$cmd" >/dev/null; then
apt-get update && apt-get install -y "$cmd"
fi
}
run_serving_tests() { run_serving_tests() {
# run serving tests using `benchmark_serving.py` # run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases # $1: a json file specifying serving test cases
...@@ -68,10 +141,10 @@ run_serving_tests() { ...@@ -68,10 +141,10 @@ run_serving_tests() {
echo "Skip test case $test_name." echo "Skip test case $test_name."
continue continue
fi fi
# append lmdeploy to the test name # prepend the current serving engine to the test name
test_name=lmdeploy_$test_name test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
# get common parameters # get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters') common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model') model=$(echo "$common_params" | jq -r '.model')
...@@ -80,13 +153,11 @@ run_serving_tests() { ...@@ -80,13 +153,11 @@ run_serving_tests() {
dataset_path=$(echo "$common_params" | jq -r '.dataset_path') dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port') port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts') num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
# get client and server arguments # get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
...@@ -94,41 +165,43 @@ run_serving_tests() { ...@@ -94,41 +165,43 @@ run_serving_tests() {
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue continue
fi fi
if [[ $reuse_server == "true" ]]; then
echo "Reuse previous server for test case $test_name"
else
kill_gpu_processes
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
"$server_params" "$common_params"
fi
if wait_for_server; then
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
break
fi
# prepare tokenizer # prepare tokenizer
# this is required for lmdeploy.
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
rm -rf /tokenizer_cache rm -rf /tokenizer_cache
mkdir /tokenizer_cache mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \ --model "$model" \
--cachedir /tokenizer_cache --cachedir /tokenizer_cache
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive # change model name for lmdeploy (it will not follow standard hf name)
wait_for_server if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
if [ $? -eq 0 ]; then model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
echo ""
echo "lmdeploy server is up and running."
else
echo ""
echo "lmdeploy failed to start within the timeout period."
break
fi fi
# get model name
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
# iterate over different QPS # iterate over different QPS
for qps in $qps_list; do for qps in $qps_list; do
# remove the surrounding single quote from qps # remove the surrounding single quote from qps
...@@ -140,31 +213,79 @@ run_serving_tests() { ...@@ -140,31 +213,79 @@ run_serving_tests() {
new_test_name=$test_name"_qps_"$qps new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \ backend=$CURRENT_LLM_SERVING_ENGINE
--backend lmdeploy \
--tokenizer /tokenizer_cache \ if [[ $backend = "trt" ]]; then
--dataset-name $dataset_name \ backend="tensorrt-llm"
--dataset-path $dataset_path \ fi
--num-prompts $num_prompts \
--port $port \ if [[ "$backend" == *"vllm"* ]]; then
--save-result \ backend="vllm"
--result-dir $RESULTS_FOLDER \ fi
--result-filename ${new_test_name}.json \
--request-rate $qps \ if [[ "$dataset_name" = "sharegpt" ]]; then
--model \"$model_name\" \
$client_args" client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
elif [[ "$dataset_name" = "sonnet" ]]; then
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--sonnet-input-len $sonnet_input_len \
--sonnet-output-len $sonnet_output_len \
--sonnet-prefix-len $sonnet_prefix_len \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
eval "$client_command" eval "$client_command"
server_command="None"
# record the benchmarking commands # record the benchmarking commands
jq_output=$(jq -n \ jq_output=$(jq -n \
--arg server "$server_command" \ --arg server "$server_command" \
--arg client "$client_command" \ --arg client "$client_command" \
--arg gpu "$gpu_type" \ --arg gpu "$gpu_type" \
--arg engine "lmdeploy" \ --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
'{ '{
server_command: $server, server_command: $server,
client_command: $client, client_command: $client,
...@@ -175,42 +296,58 @@ run_serving_tests() { ...@@ -175,42 +296,58 @@ run_serving_tests() {
done done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done done
kill_gpu_processes
} }
upload_to_buildkite() { prepare_dataset() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0 # download sharegpt dataset
if [ ! -f /workspace/buildkite-agent ]; then cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
echo "buildkite-agent binary not found. Skip uploading the results." wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() { main() {
# check if the environment variable is successfully injected from yaml
check_gpus check_gpus
# enter vllm directory check_hf_token
cd $VLLM_SOURCE_CODE_LOC/benchmarks get_current_llm_serving_engine
pip install -U transformers
# check storage
df -h
ensure_installed wget
ensure_installed curl
ensure_installed jq
prepare_dataset
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
declare -g RESULTS_FOLDER=results/ declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
python -m pip install transformers==4.41.2 # run the test
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
export CURRENT_LLM_SERVING_ENGINE=lmdeploy # upload benchmark results to buildkite
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json python3 -m pip install tabulate pandas
python -m pip install tabulate pandas python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite upload_to_buildkite
} }
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally # Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes. # and we still want to see other benchmarking results even when mixtral crashes.
set -x
set -o pipefail set -o pipefail
check_gpus() { check_gpus() {
...@@ -17,7 +18,7 @@ check_gpus() { ...@@ -17,7 +18,7 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking." echo "Need at least 1 GPU to run benchmarking."
exit 1 exit 1
fi fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
echo "GPU type is $gpu_type" echo "GPU type is $gpu_type"
} }
...@@ -85,15 +86,11 @@ kill_gpu_processes() { ...@@ -85,15 +86,11 @@ kill_gpu_processes() {
ps -aux ps -aux
lsof -t -i:8000 | xargs -r kill -9 lsof -t -i:8000 | xargs -r kill -9
pkill -f pt_main_thread pgrep python3 | xargs -r kill -9
# this line doesn't work now
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
pkill -f python3
pkill -f /usr/bin/python3
# wait until GPU memory usage smaller than 1GB # wait until GPU memory usage smaller than 1GB
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1 sleep 1
done done
...@@ -117,7 +114,7 @@ upload_to_buildkite() { ...@@ -117,7 +114,7 @@ upload_to_buildkite() {
fi fi
# Use the determined command to annotate and upload artifacts # Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
} }
...@@ -150,7 +147,7 @@ run_latency_tests() { ...@@ -150,7 +147,7 @@ run_latency_tests() {
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue continue
fi fi
...@@ -206,9 +203,9 @@ run_throughput_tests() { ...@@ -206,9 +203,9 @@ run_throughput_tests() {
throughput_args=$(json2args "$throughput_params") throughput_args=$(json2args "$throughput_params")
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
tp=$(echo $throughput_params | jq -r '.tensor_parallel_size') tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue continue
fi fi
...@@ -270,7 +267,7 @@ run_serving_tests() { ...@@ -270,7 +267,7 @@ run_serving_tests() {
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue continue
fi fi
...@@ -278,7 +275,7 @@ run_serving_tests() { ...@@ -278,7 +275,7 @@ run_serving_tests() {
server_model=$(echo "$server_params" | jq -r '.model') server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $testname." echo "Server model and client model must be the same. Skip testcase $test_name."
continue continue
fi fi
...@@ -289,12 +286,11 @@ run_serving_tests() { ...@@ -289,12 +286,11 @@ run_serving_tests() {
# run the server # run the server
echo "Running test case $test_name" echo "Running test case $test_name"
echo "Server command: $server_command" echo "Server command: $server_command"
eval "$server_command" & bash -c "$server_command" &
server_pid=$! server_pid=$!
# wait until the server is alive # wait until the server is alive
wait_for_server if wait_for_server; then
if [ $? -eq 0 ]; then
echo "" echo ""
echo "vllm server is up and running." echo "vllm server is up and running."
else else
...@@ -323,7 +319,7 @@ run_serving_tests() { ...@@ -323,7 +319,7 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
eval "$client_command" bash -c "$client_command"
# record the benchmarking commands # record the benchmarking commands
jq_output=$(jq -n \ jq_output=$(jq -n \
......
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment