Commit ad385667 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.6.3.post1-dev'

parents be0967c1 903593d3
import os import os
import sys
import zipfile import zipfile
MAX_SIZE_MB = 250 # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
def print_top_10_largest_files(zip_file): def print_top_10_largest_files(zip_file):
"""Print the top 10 largest files in the given zip file."""
with zipfile.ZipFile(zip_file, 'r') as z: with zipfile.ZipFile(zip_file, 'r') as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True) file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]: for f, size in file_sizes[:10]:
print(f"{f}: {size/(1024*1024)} MBs uncompressed.") print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
def check_wheel_size(directory): def check_wheel_size(directory):
"""Check the size of .whl files in the given directory."""
for root, _, files in os.walk(directory): for root, _, files in os.walk(directory):
for f in files: for file_name in files:
if f.endswith(".whl"): if file_name.endswith(".whl"):
wheel_path = os.path.join(root, f) wheel_path = os.path.join(root, file_name)
wheel_size = os.path.getsize(wheel_path) wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
wheel_size_mb = wheel_size / (1024 * 1024) if wheel_size_mb > VLLM_MAX_SIZE_MB:
if wheel_size_mb > MAX_SIZE_MB: print(f"Not allowed: Wheel {wheel_path} is larger "
print( f"({wheel_size_mb:.2f} MB) than the limit "
f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) " f"({VLLM_MAX_SIZE_MB} MB).")
f"compare to the allowed size ({MAX_SIZE_MB} MB).")
print_top_10_largest_files(wheel_path) print_top_10_largest_files(wheel_path)
return 1 return 1
else: else:
print(f"Wheel {wheel_path} is within the allowed size " print(f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb} MB).") f"({wheel_size_mb:.2f} MB).")
return 0 return 0
if __name__ == "__main__": if __name__ == "__main__":
import sys if len(sys.argv) < 2:
sys.exit(check_wheel_size(sys.argv[1])) print("Usage: python check-wheel-size.py <directory>")
sys.exit(1)
directory = sys.argv[1]
sys.exit(check_wheel_size(directory))
\ No newline at end of file
...@@ -9,3 +9,4 @@ tasks: ...@@ -9,3 +9,4 @@ tasks:
value: 0.664 value: 0.664
limit: 1000 limit: 1000
num_fewshot: 5 num_fewshot: 5
trust_remote_code: True
\ No newline at end of file
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.764
- name: "exact_match,flexible-extract"
value: 0.764
limit: 250
num_fewshot: 5
...@@ -4,8 +4,8 @@ tasks: ...@@ -4,8 +4,8 @@ tasks:
- name: "gsm8k" - name: "gsm8k"
metrics: metrics:
- name: "exact_match,strict-match" - name: "exact_match,strict-match"
value: 0.409 value: 0.419
- name: "exact_match,flexible-extract" - name: "exact_match,flexible-extract"
value: 0.406 value: 0.416
limit: 1000 limit: 1000
num_fewshot: 5 num_fewshot: 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "nvidia/Minitron-4B-Base" model_name: "mgoin/Minitron-4B-Base-FP8"
tasks: tasks:
- name: "gsm8k" - name: "gsm8k"
metrics: metrics:
- name: "exact_match,strict-match" - name: "exact_match,strict-match"
value: 0.252 value: 0.233
- name: "exact_match,flexible-extract" - name: "exact_match,flexible-extract"
value: 0.252 value: 0.236
limit: 1000 limit: 1000
num_fewshot: 5 num_fewshot: 5
Meta-Llama-3-8B-Instruct.yaml Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base.yaml Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml Meta-Llama-3-8B-QQQ.yaml
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers. # We can use this script to compute baseline accuracy on GSM for transformers.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 # pip install lm-eval==0.4.4
usage() { usage() {
echo`` echo``
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.3 # pip install lm-eval==0.4.4
usage() { usage() {
echo`` echo``
......
...@@ -14,7 +14,7 @@ import lm_eval ...@@ -14,7 +14,7 @@ import lm_eval
import numpy import numpy
import yaml import yaml
RTOL = 0.02 RTOL = 0.05
TEST_DATA_FILE = os.environ.get( TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE", "LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
...@@ -23,9 +23,12 @@ TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) ...@@ -23,9 +23,12 @@ TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config): def launch_lm_eval(eval_config):
trust_remote_code = eval_config.get('trust_remote_code', False)
model_args = f"pretrained={eval_config['model_name']}," \ model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={TP_SIZE}," \ f"tensor_parallel_size={TP_SIZE}," \
f"add_bos_token=true" f"add_bos_token=true," \
f"trust_remote_code={trust_remote_code}"
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
model="vllm", model="vllm",
...@@ -46,10 +49,15 @@ def test_lm_eval_correctness(): ...@@ -46,10 +49,15 @@ def test_lm_eval_correctness():
results = launch_lm_eval(eval_config) results = launch_lm_eval(eval_config)
# Confirm scores match ground truth. # Confirm scores match ground truth.
success = True
for task in eval_config["tasks"]: for task in eval_config["tasks"]:
for metric in task["metrics"]: for metric in task["metrics"]:
ground_truth = metric["value"] ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]] measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: ' print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}') f'ground_truth={ground_truth} | measured={measured_value}')
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) success = success and numpy.isclose(
ground_truth, measured_value, rtol=RTOL)
# Assert at the end, print all scores even on failure for debugging.
assert success
...@@ -34,17 +34,18 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan ...@@ -34,17 +34,18 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan
Performance benchmark will be triggered when: Performance benchmark will be triggered when:
- A PR being merged into vllm. - A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label. - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
Nightly benchmark will be triggered when: Nightly benchmark will be triggered when:
- Every commit for those PRs with `nightly-benchmarks` label. - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details ## Performance benchmark details
See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
#### Latency test #### Latency test
...@@ -68,7 +69,7 @@ Here is an example of one test inside `latency-tests.json`: ...@@ -68,7 +69,7 @@ Here is an example of one test inside `latency-tests.json`:
In this example: In this example:
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` - The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
......
...@@ -8,8 +8,7 @@ steps: ...@@ -8,8 +8,7 @@ steps:
containers: containers:
- image: badouralix/curl-jq - image: badouralix/curl-jq
command: command:
- sh - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait - wait
- label: "A100" - label: "A100"
agents: agents:
...@@ -21,7 +20,7 @@ steps: ...@@ -21,7 +20,7 @@ steps:
containers: containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command: command:
- bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources: resources:
limits: limits:
nvidia.com/gpu: 8 nvidia.com/gpu: 8
......
## Description
This file contains the downloading link for benchmarking results.
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)
Please download the visualization scripts in the post
## Results reproduction
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code
```
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside `./benchmarks/results`.
# Nightly benchmark # Nightly benchmark
The main goal of this benchmarking is two-fold: This benchmark aims to:
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload. - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md](). - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
## Docker images
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 ## Setup
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1 - Docker images:
- vLLM: `vllm/vllm-openai:v0.6.2`
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. --> - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
## Hardware - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
One AWS node with 8x NVIDIA A100 GPUs. - Hardware
- 8x Nvidia A100 GPUs
- Workload:
## Workload description - Dataset
- ShareGPT dataset
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed). - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Output length: the corresponding output length of these 500 prompts. - Models: llama-3 8B, llama-3 70B.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
# Known issues
## Plots
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed. - TGI does not support `ignore-eos` flag.
\ No newline at end of file
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
## Results
{nightly_results_benchmarking_table}
...@@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec ...@@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec
common_container_settings: &common_container_settings common_container_settings: &common_container_settings
command: command:
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
resources: resources:
limits: limits:
nvidia.com/gpu: 8 nvidia.com/gpu: 8
...@@ -37,7 +37,24 @@ common_container_settings: &common_container_settings ...@@ -37,7 +37,24 @@ common_container_settings: &common_container_settings
steps: steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- label: "A100 trt benchmark"
- label: "A100 vllm step 10"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:v0.6.2
<<: *common_container_settings
- label: "A100 sglang benchmark"
priority: 100 priority: 100
agents: agents:
queue: A100 queue: A100
...@@ -46,7 +63,7 @@ steps: ...@@ -46,7 +63,7 @@ steps:
podSpec: podSpec:
<<: *common_pod_spec <<: *common_pod_spec
containers: containers:
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - image: lmsysorg/sglang:v0.3.2-cu121
<<: *common_container_settings <<: *common_container_settings
- label: "A100 lmdeploy benchmark" - label: "A100 lmdeploy benchmark"
...@@ -58,11 +75,13 @@ steps: ...@@ -58,11 +75,13 @@ steps:
podSpec: podSpec:
<<: *common_pod_spec <<: *common_pod_spec
containers: containers:
- image: openmmlab/lmdeploy:v0.5.0 - image: openmmlab/lmdeploy:v0.6.1-cu12
<<: *common_container_settings <<: *common_container_settings
- label: "A100 vllm benchmark"
- label: "A100 trt llama-8B"
priority: 100 priority: 100
agents: agents:
queue: A100 queue: A100
...@@ -71,10 +90,25 @@ steps: ...@@ -71,10 +90,25 @@ steps:
podSpec: podSpec:
<<: *common_pod_spec <<: *common_pod_spec
containers: containers:
- image: vllm/vllm-openai:latest - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
<<: *common_container_settings <<: *common_container_settings
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- name: TEST_SELECTOR
value: "llama8B"
- label: "A100 tgi benchmark" - label: "A100 trt llama-70B"
priority: 100 priority: 100
agents: agents:
queue: A100 queue: A100
...@@ -83,12 +117,54 @@ steps: ...@@ -83,12 +117,54 @@ steps:
podSpec: podSpec:
<<: *common_pod_spec <<: *common_pod_spec
containers: containers:
- image: ghcr.io/huggingface/text-generation-inference:2.1 - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
<<: *common_container_settings <<: *common_container_settings
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- name: TEST_SELECTOR
value: "llama70B"
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
# - label: "A100 trt benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
# <<: *common_container_settings
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
# - label: "A100 tgi benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
# <<: *common_container_settings
- wait - wait
- label: "Plot" - label: "Collect the results"
priority: 100 priority: 100
agents: agents:
queue: A100 queue: A100
...@@ -117,4 +193,4 @@ steps: ...@@ -117,4 +193,4 @@ steps:
name: hf-token-secret name: hf-token-secret
key: token key: token
- wait - block: ":rocket: check the results!"
\ No newline at end of file \ No newline at end of file
## Latency tests ## Latency tests
This test suite aims to test vllm's end-to-end latency under a controlled setup.
- Input length: 32 tokens. - Input length: 32 tokens.
- Output length: 128 tokens. - Output length: 128 tokens.
- Batch size: fixed (8). - Batch size: fixed (8).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99). - Evaluation metrics: end-to-end latency (mean, median, p99).
### Latency benchmarking results
{latency_tests_markdown_table} {latency_tests_markdown_table}
## Throughput tests
This test suite aims to test vllm's throughput. ## Throughput tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts. - Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput. - Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput. - Evaluation metrics: throughput.
### Throughput benchmarking results
{throughput_tests_markdown_table} {throughput_tests_markdown_table}
## Serving tests
This test suite aims to test vllm's real serving metrics. ## Serving tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts. - Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests. - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B, under QPS 2
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
### Serving benchmarking results
{serving_tests_markdown_table} {serving_tests_markdown_table}
## json version of the benchmarking tables ## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format. This section contains the data of the markdown tables above in JSON format.
......
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
main() {
check_gpus
check_hf_token
df -h
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# run lmdeploy
if which lmdeploy >/dev/null; then
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
exit 0
fi
# run tgi
if [ -e /tgi-entrypoint.sh ]; then
echo "tgi is available, redirect to run-tgi-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
exit 0
fi
# run trt
if which trtllm-build >/dev/null; then
echo "trtllm is available, redirect to run-trt-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
exit 0
fi
# run vllm
if [ -e /vllm-workspace ]; then
echo "vllm is available, redirect to run-vllm-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
exit 0
fi
}
main "$@"
\ No newline at end of file
...@@ -174,8 +174,8 @@ if __name__ == "__main__": ...@@ -174,8 +174,8 @@ if __name__ == "__main__":
# document the result # document the result
with open(results_folder / "benchmark_results.md", "w") as f: with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown( results = read_markdown("../.buildkite/nightly-benchmarks/" +
"../.buildkite/nightly-benchmarks/tests/descriptions.md") "performance-benchmarks-descriptions.md")
results = results.format( results = results.format(
latency_tests_markdown_table=latency_md_table, latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table, throughput_tests_markdown_table=throughput_md_table,
......
import argparse import argparse
import json import json
import math
from pathlib import Path from pathlib import Path
import matplotlib.pyplot as plt import numpy as np
import pandas as pd import pandas as pd
from tabulate import tabulate from tabulate import tabulate
...@@ -25,8 +24,48 @@ def parse_arguments(): ...@@ -25,8 +24,48 @@ def parse_arguments():
return args return args
def get_perf(df, method, model, metric):
means = []
for qps in [2, 4, 8, 16, "inf"]:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
target = target & df['Test name'].str.contains("qps_" + str(qps))
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
else:
means.append(filtered_df[metric].values[0])
return np.array(means)
def get_perf_w_std(df, method, model, metric):
if metric in ["TTFT", "ITL"]:
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
mean = mean.tolist()
std = get_perf(df, method, model, "Std " + metric + " (ms)")
if std.mean() == 0:
std = None
success = get_perf(df, method, model, "Successful req.")
if std is not None:
std = std / np.sqrt(success)
std = std.tolist()
else:
assert metric == "Tput"
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
df, method, model, "Output Tput (tok/s)")
mean = mean.tolist()
std = None
return mean, std
def main(args): def main(args):
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
results_folder = Path(args.results_folder) results_folder = Path(args.results_folder)
results = [] results = []
...@@ -50,85 +89,6 @@ def main(args): ...@@ -50,85 +89,6 @@ def main(args):
with open("nightly_results.md", "w") as f: with open("nightly_results.md", "w") as f:
f.write(description) f.write(description)
plt.rcParams.update({'font.size': 20})
# plot results
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
fig.subplots_adjust(hspace=1)
methods = ["vllm", "trt", "lmdeploy", "tgi"]
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
for j, metric in enumerate(["TTFT", "ITL"]):
means, stds = [], []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
stds.append(0.)
else:
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
std = filtered_df[f"Std {metric} (ms)"].values[0]
success = filtered_df["Successful req."].values[0]
stds.append(std / math.sqrt(success))
print(model, metric)
print(means, stds)
ax = axes[i, j + 1]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
means,
yerr=stds,
capsize=10,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel(f"{metric} (ms)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
metric = "Tput"
j = 0
if True:
tputs = []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
tputs.append(0.)
else:
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
tputs.append(input_tput + output_tput)
print(model, metric)
print(tputs)
ax = axes[i, j]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
tputs,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel("Tput (token/s)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
fig.tight_layout()
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
if __name__ == '__main__': if __name__ == '__main__':
args = parse_arguments() args = parse_arguments()
......
#!/bin/bash
# Currently FP8 benchmark is NOT enabled.
set -x
server_params=$1
common_params=$2
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
launch_trt_server() {
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
# create model caching directory
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
# clone tensorrt backend
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
# build trtllm engine
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
trtllm-build \
--checkpoint_dir ${trt_model_path} \
--use_fused_mlp \
--reduce_fusion disable \
--workers 8 \
--gpt_attention_plugin ${model_dtype} \
--gemm_plugin ${model_dtype} \
--tp_size ${model_tp_size} \
--max_batch_size ${max_batch_size} \
--max_input_len ${max_input_len} \
--max_seq_len ${max_seq_len} \
--max_num_tokens ${max_num_tokens} \
--output_dir ${trt_engine_path}
# handle triton protobuf files and launch triton server
cd /tensorrtllm_backend
mkdir triton_model_repo
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
cd triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &
}
launch_tgi_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
echo "Server command: $server_command"
eval "$server_command" &
}
launch_lmdeploy_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Server command: $server_command"
bash -c "$server_command" &
}
launch_sglang_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
fi
# run the server
echo "Server command: $server_command"
eval "$server_command" &
}
launch_vllm_server() {
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Server command: $server_command"
eval "$server_command" &
}
main() {
if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then
launch_trt_server
fi
if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then
launch_tgi_server
fi
if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then
launch_lmdeploy_server
fi
if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then
launch_sglang_server
fi
if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
launch_vllm_server
fi
}
main
#!/bin/bash
server_params=$1
common_params=$2
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)
# make sure the parameter inside tensorrt_demo is consistent to envvar
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
python ../quantization/quantize.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path} \
--qformat fp8 \
--kv_cache_dtype fp8 \
--calib_size 2
else
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
fi
trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path}
cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment