Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
006693ed
Commit
006693ed
authored
Dec 01, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.11.2' into v0.11.2-ori
parents
4b51e6f1
275de341
Changes
1000
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
171 additions
and
581 deletions
+171
-581
.buildkite/check-wheel-size.py
.buildkite/check-wheel-size.py
+2
-2
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+4
-4
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
...nfigs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
+12
-0
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
.../configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+10
-0
.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
...l-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+2
-1
.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
...dkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
+12
-0
.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
...al-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+14
-0
.buildkite/lm-eval-harness/configs/models-large-hopper.txt
.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+1
-0
.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
+1
-0
.buildkite/lm-eval-harness/configs/models-mm-small.txt
.buildkite/lm-eval-harness/configs/models-mm-small.txt
+1
-0
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
.../lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+44
-0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+0
-0
.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
...kite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+50
-0
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+18
-4
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+0
-184
.buildkite/nightly-benchmarks/nightly-annotation.md
.buildkite/nightly-benchmarks/nightly-annotation.md
+0
-28
.buildkite/nightly-benchmarks/nightly-descriptions.md
.buildkite/nightly-benchmarks/nightly-descriptions.md
+0
-39
.buildkite/nightly-benchmarks/nightly-pipeline.yaml
.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+0
-196
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+0
-26
.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
...e/nightly-benchmarks/scripts/generate-nightly-markdown.py
+0
-97
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
.buildkite/check-wheel-size.py
View file @
006693ed
...
...
@@ -5,11 +5,11 @@ import os
import
sys
import
zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to
4
50 MiB
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 5
0
0 MiB
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB
=
int
(
os
.
environ
.
get
(
"VLLM_MAX_SIZE_MB"
,
4
50
))
VLLM_MAX_SIZE_MB
=
int
(
os
.
environ
.
get
(
"VLLM_MAX_SIZE_MB"
,
5
0
0
))
def
print_top_10_largest_files
(
zip_file
):
...
...
.buildkite/lm-eval-harness/configs/
Qwen2-1.5B-Instruct-W8A16-compressed-tensors
.yaml
→
.buildkite/lm-eval-harness/configs/
Meta-Llama-3-8B-QQQ
.yaml
View file @
006693ed
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m
nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto"
-l 1000 -f 5 -t 1
model_name
:
"
nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise
"
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m
HandH1998/QQQ-Llama-3-8b-g128 -b 32
-l 1000 -f 5 -t 1
model_name
:
"
HandH1998/QQQ-Llama-3-8b-g128
"
tasks
:
-
name
:
"
gsm8k"
metrics
:
-
name
:
"
exact_match,strict-match"
value
:
0.
595
value
:
0.
419
-
name
:
"
exact_match,flexible-extract"
value
:
0.
582
value
:
0.
416
limit
:
1000
num_fewshot
:
5
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
0 → 100644
View file @
006693ed
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name
:
"
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend
:
"
vllm-vlm"
tasks
:
-
name
:
"
chartqa"
metrics
:
-
name
:
"
relaxed_accuracy,none"
# TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
value
:
0.80
limit
:
100
num_fewshot
:
0
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
0 → 100644
View file @
006693ed
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name
:
"
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
tasks
:
-
name
:
"
mmlu_pro"
metrics
:
-
name
:
"
exact_match,custom-extract"
value
:
0.80
limit
:
250
# will run on 250 * 14 subjects = 3500 samples
num_fewshot
:
5
.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
View file @
006693ed
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
# For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name
:
"
RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks
:
-
name
:
"
gsm8k"
...
...
.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
0 → 100644
View file @
006693ed
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name
:
"
Qwen/Qwen2.5-VL-7B-Instruct"
backend
:
"
vllm-vlm"
tasks
:
-
name
:
"
chartqa"
metrics
:
-
name
:
"
relaxed_accuracy,none"
value
:
0.855
limit
:
2500
num_fewshot
:
0
.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
0 → 100644
View file @
006693ed
model_name
:
"
Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
tasks
:
-
name
:
"
mmlu_pro"
metrics
:
-
name
:
"
exact_match,custom-extract"
value
:
0.82
limit
:
250
# will run on 250 * 14 subjects = 3500 samples
num_fewshot
:
5
enforce_eager
:
false
# we use false to speed up the eval process
kv_cache_dtype
:
fp8
# we use fp8 to speed up the eval process
max_model_len
:
40960
apply_chat_template
:
true
fewshot_as_multiturn
:
true
gen_kwargs
:
"
temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
.buildkite/lm-eval-harness/configs/models-large-hopper.txt
0 → 100644
View file @
006693ed
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
0 → 100644
View file @
006693ed
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
.buildkite/lm-eval-harness/configs/models-mm-small.txt
0 → 100644
View file @
006693ed
Qwen2.5-VL-7B-Instruct.yaml
\ No newline at end of file
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
0 → 100755
View file @
006693ed
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage
()
{
echo
``
echo
"Runs lm eval harness on ChartQA using multimodal vllm."
echo
"This pathway is intended to be used to create baselines for "
echo
"our correctness tests in vllm's CI."
echo
echo
"usage:
${
0
}
<options>"
echo
echo
" -m - huggingface stub or local directory of the model"
echo
" -l - limit number of samples to run"
echo
" -t - tensor parallel size to run at"
echo
}
while
getopts
"m:l:t:"
OPT
;
do
case
${
OPT
}
in
m
)
MODEL
=
"
$OPTARG
"
;;
l
)
LIMIT
=
"
$OPTARG
"
;;
t
)
TP_SIZE
=
"
$OPTARG
"
;;
\?
)
usage
exit
1
;;
esac
done
lm_eval
--model
vllm-vlm
\
--model_args
"pretrained=
$MODEL
,tensor_parallel_size=
$TP_SIZE
"
\
--tasks
chartqa
\
--batch_size
auto
\
--apply_chat_template
\
--limit
$LIMIT
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
100644 → 100755
View file @
006693ed
File mode changed from 100644 to 100755
.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
0 → 100644
View file @
006693ed
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage
()
{
echo
``
echo
"Runs lm eval harness on MMLU Pro using huggingface transformers."
echo
"This pathway is intended to be used to create baselines for "
echo
"our automated nm-test-accuracy workflow"
echo
echo
"usage:
${
0
}
<options>"
echo
echo
" -m - huggingface stub or local directory of the model"
echo
" -l - limit number of samples to run"
echo
" -f - number of fewshot samples to use"
echo
" -t - tensor parallel size to run at"
echo
}
while
getopts
"m:b:l:f:t:"
OPT
;
do
case
${
OPT
}
in
m
)
MODEL
=
"
$OPTARG
"
;;
b
)
BATCH_SIZE
=
"
$OPTARG
"
;;
l
)
LIMIT
=
"
$OPTARG
"
;;
f
)
FEWSHOT
=
"
$OPTARG
"
;;
t
)
TP_SIZE
=
"
$OPTARG
"
;;
\?
)
usage
exit
1
;;
esac
done
lm_eval
--model
vllm
\
--model_args
"pretrained=
$MODEL
,tensor_parallel_size=
$TP_SIZE
,add_bos_token=true,trust_remote_code=true,max_model_len=4096"
\
--tasks
mmlu_pro
--num_fewshot
"
$FEWSHOT
"
--limit
"
$LIMIT
"
\
--batch_size
auto
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
View file @
006693ed
...
...
@@ -19,21 +19,35 @@ RTOL = 0.08
def
launch_lm_eval
(
eval_config
,
tp_size
):
trust_remote_code
=
eval_config
.
get
(
"trust_remote_code"
,
False
)
max_model_len
=
eval_config
.
get
(
"max_model_len"
,
4096
)
batch_size
=
eval_config
.
get
(
"batch_size"
,
"auto"
)
backend
=
eval_config
.
get
(
"backend"
,
"vllm"
)
enforce_eager
=
eval_config
.
get
(
"enforce_eager"
,
"true"
)
kv_cache_dtype
=
eval_config
.
get
(
"kv_cache_dtype"
,
"auto"
)
model_args
=
(
f
"pretrained=
{
eval_config
[
'model_name'
]
}
,"
f
"tensor_parallel_size=
{
tp_size
}
,"
f
"enforce_eager=true,"
f
"enforce_eager=
{
enforce_eager
}
,"
f
"kv_cache_dtype=
{
kv_cache_dtype
}
,"
f
"add_bos_token=true,"
f
"trust_remote_code=
{
trust_remote_code
}
,"
f
"max_model_len=
{
max_model_len
}
"
f
"max_model_len=
{
max_model_len
}
,
"
)
results
=
lm_eval
.
simple_evaluate
(
model
=
"vllm"
,
model
=
backend
,
model_args
=
model_args
,
tasks
=
[
task
[
"name"
]
for
task
in
eval_config
[
"tasks"
]],
num_fewshot
=
eval_config
[
"num_fewshot"
],
limit
=
eval_config
[
"limit"
],
batch_size
=
"auto"
,
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
# text models. however, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm, or explicitly set
apply_chat_template
=
eval_config
.
get
(
"apply_chat_template"
,
backend
==
"vllm-vlm"
),
fewshot_as_multiturn
=
eval_config
.
get
(
"fewshot_as_multiturn"
,
False
),
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs
=
eval_config
.
get
(
"gen_kwargs"
),
batch_size
=
batch_size
,
)
return
results
...
...
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
deleted
100644 → 0
View file @
4b51e6f1
steps
:
-
label
:
"
Wait
for
container
to
be
ready"
key
:
wait-for-container-image
agents
:
queue
:
A100
plugins
:
-
kubernetes
:
podSpec
:
containers
:
-
image
:
badouralix/curl-jq
command
:
-
sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-
label
:
"
Cleanup
H100"
agents
:
queue
:
H100
depends_on
:
~
command
:
docker system prune -a --volumes --force
-
label
:
"
A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents
:
queue
:
A100
depends_on
:
wait-for-container-image
if
:
build.branch == "main"
plugins
:
-
kubernetes
:
podSpec
:
priorityClassName
:
perf-benchmark
containers
:
-
image
:
public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command
:
-
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources
:
limits
:
nvidia.com/gpu
:
8
volumeMounts
:
-
name
:
devshm
mountPath
:
/dev/shm
env
:
-
name
:
VLLM_USAGE_SOURCE
value
:
ci-test
-
name
:
HF_TOKEN
valueFrom
:
secretKeyRef
:
name
:
hf-token-secret
key
:
token
nodeSelector
:
nvidia.com/gpu.product
:
NVIDIA-A100-SXM4-80GB
volumes
:
-
name
:
devshm
emptyDir
:
medium
:
Memory
-
label
:
"
H200"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents
:
queue
:
H200
depends_on
:
wait-for-container-image
if
:
build.branch == "main"
plugins
:
-
docker#v5.12.0
:
image
:
public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command
:
-
bash
-
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent
:
true
propagate-environment
:
true
ipc
:
host
gpus
:
4,5,6,7
volumes
:
-
/data/benchmark-hf-cache:/root/.cache/huggingface
environment
:
-
VLLM_USAGE_SOURCE
-
HF_TOKEN
#- block: "Run H100 Benchmark"
#key: block-h100
#depends_on: ~
-
label
:
"
H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents
:
queue
:
H100
depends_on
:
wait-for-container-image
if
:
build.branch == "main"
plugins
:
-
docker#v5.12.0
:
image
:
public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command
:
-
bash
-
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent
:
true
propagate-environment
:
true
ipc
:
host
gpus
:
all
# see CUDA_VISIBLE_DEVICES for actual GPUs used
volumes
:
-
/data/benchmark-hf-cache:/root/.cache/huggingface
environment
:
-
VLLM_USAGE_SOURCE
-
HF_TOKEN
# Premerge benchmark
-
label
:
"
A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents
:
queue
:
A100
depends_on
:
wait-for-container-image
if
:
build.branch != "main"
plugins
:
-
kubernetes
:
podSpec
:
priorityClassName
:
perf-benchmark
containers
:
-
image
:
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command
:
-
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources
:
limits
:
nvidia.com/gpu
:
8
volumeMounts
:
-
name
:
devshm
mountPath
:
/dev/shm
env
:
-
name
:
VLLM_USAGE_SOURCE
value
:
ci-test
-
name
:
HF_TOKEN
valueFrom
:
secretKeyRef
:
name
:
hf-token-secret
key
:
token
nodeSelector
:
nvidia.com/gpu.product
:
NVIDIA-A100-SXM4-80GB
volumes
:
-
name
:
devshm
emptyDir
:
medium
:
Memory
-
label
:
"
H200"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents
:
queue
:
H200
depends_on
:
wait-for-container-image
if
:
build.branch != "main"
plugins
:
-
docker#v5.12.0
:
image
:
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command
:
-
bash
-
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent
:
true
propagate-environment
:
true
ipc
:
host
gpus
:
4,5,6,7
volumes
:
-
/data/benchmark-hf-cache:/root/.cache/huggingface
environment
:
-
VLLM_USAGE_SOURCE
-
HF_TOKEN
#- block: "Run H100 Benchmark"
#key: block-h100
#depends_on: ~
-
label
:
"
H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents
:
queue
:
H100
depends_on
:
wait-for-container-image
if
:
build.branch != "main"
plugins
:
-
docker#v5.12.0
:
image
:
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command
:
-
bash
-
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent
:
true
propagate-environment
:
true
ipc
:
host
gpus
:
all
# see CUDA_VISIBLE_DEVICES for actual GPUs used
volumes
:
-
/data/benchmark-hf-cache:/root/.cache/huggingface
environment
:
-
VLLM_USAGE_SOURCE
-
HF_TOKEN
.buildkite/nightly-benchmarks/nightly-annotation.md
deleted
100644 → 0
View file @
4b51e6f1
# Nightly benchmark annotation
## Description
This file contains the downloading link for benchmarking results.
-
[
benchmarking pipeline
](
artifact://nightly-pipeline.yaml
)
-
[
benchmarking results
](
artifact://results.zip
)
-
[
benchmarking code
](
artifact://nightly-benchmarks.zip
)
Please download the visualization scripts in the post
## Results reproduction
-
Find the docker we use in
`benchmarking pipeline`
-
Deploy the docker, and inside the docker:
-
Download
`nightly-benchmarks.zip`
.
-
In the same folder, run the following code:
```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside
`./benchmarks/results`
.
.buildkite/nightly-benchmarks/nightly-descriptions.md
deleted
100644 → 0
View file @
4b51e6f1
# Nightly benchmark
This benchmark aims to:
-
Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
-
Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following the reproduction instructions.
Latest results:
[
results link
](
https://blog.vllm.ai/2024/09/05/perf-update.html
)
, scroll to the end.
Latest reproduction guide:
[
github issue link
](
https://github.com/vllm-project/vllm/issues/8176
)
## Setup
-
Docker images:
-
vLLM:
`vllm/vllm-openai:v0.6.2`
-
SGLang:
`lmsysorg/sglang:v0.3.2-cu121`
-
LMDeploy:
`openmmlab/lmdeploy:v0.6.1-cu12`
-
TensorRT-LLM:
`nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-
*NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
-
Check
[
nightly-pipeline.yaml
](
nightly-pipeline.yaml
)
for the concrete docker images, specs and commands we use for the benchmark.
-
Hardware
-
8x Nvidia A100 GPUs
-
Workload:
-
Dataset
-
ShareGPT dataset
-
Prefill-heavy dataset (on average 462 input tokens, 16 tokens as output)
-
Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
-
Check
[
nightly-tests.json
](
tests/nightly-tests.json
)
for the concrete configuration of datasets we use.
-
Models: llama-3 8B, llama-3 70B.
-
We do not use llama 3.1 as it is incompatible with trt-llm r24.07. (
[
issue
](
https://github.com/NVIDIA/TensorRT-LLM/issues/2105
)
).
-
Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
-
Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
-
Evaluation metrics: Throughput (the higher the better), TTFT (time to first token, the lower the better), ITL (inter-token latency, the lower the better).
## Known issues
-
TRT-LLM crashes with Llama 3.1 8B
[
issue
](
https://github.com/NVIDIA/TensorRT-LLM/issues/2105
)
.
-
TGI does not support
`ignore-eos`
flag.
.buildkite/nightly-benchmarks/nightly-pipeline.yaml
deleted
100644 → 0
View file @
4b51e6f1
common_pod_spec
:
&common_pod_spec
priorityClassName
:
perf-benchmark
nodeSelector
:
nvidia.com/gpu.product
:
NVIDIA-A100-SXM4-80GB
volumes
:
-
name
:
devshm
emptyDir
:
medium
:
Memory
-
name
:
hf-cache
hostPath
:
path
:
/root/.cache/huggingface
type
:
Directory
common_container_settings
:
&common_container_settings
command
:
-
bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
resources
:
limits
:
nvidia.com/gpu
:
8
volumeMounts
:
-
name
:
devshm
mountPath
:
/dev/shm
-
name
:
hf-cache
mountPath
:
/root/.cache/huggingface
env
:
-
name
:
VLLM_USAGE_SOURCE
value
:
ci-test
-
name
:
HF_HOME
value
:
/root/.cache/huggingface
-
name
:
VLLM_SOURCE_CODE_LOC
value
:
/workspace/build/buildkite/vllm/performance-benchmark
-
name
:
HF_TOKEN
valueFrom
:
secretKeyRef
:
name
:
hf-token-secret
key
:
token
steps
:
-
block
:
"
:rocket:
Ready
for
comparing
vllm
against
alternatives?
This
will
take
4
hours."
-
label
:
"
A100
vllm
step
10"
priority
:
100
agents
:
queue
:
A100
plugins
:
-
kubernetes
:
podSpec
:
<<
:
*common_pod_spec
containers
:
-
image
:
vllm/vllm-openai:v0.6.2
<<
:
*common_container_settings
-
label
:
"
A100
sglang
benchmark"
priority
:
100
agents
:
queue
:
A100
plugins
:
-
kubernetes
:
podSpec
:
<<
:
*common_pod_spec
containers
:
-
image
:
lmsysorg/sglang:v0.3.2-cu121
<<
:
*common_container_settings
-
label
:
"
A100
lmdeploy
benchmark"
priority
:
100
agents
:
queue
:
A100
plugins
:
-
kubernetes
:
podSpec
:
<<
:
*common_pod_spec
containers
:
-
image
:
openmmlab/lmdeploy:v0.6.1-cu12
<<
:
*common_container_settings
-
label
:
"
A100
trt
llama-8B"
priority
:
100
agents
:
queue
:
A100
plugins
:
-
kubernetes
:
podSpec
:
<<
:
*common_pod_spec
containers
:
-
image
:
nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
<<
:
*common_container_settings
env
:
-
name
:
VLLM_USAGE_SOURCE
value
:
ci-test
-
name
:
HF_HOME
value
:
/root/.cache/huggingface
-
name
:
VLLM_SOURCE_CODE_LOC
value
:
/workspace/build/buildkite/vllm/performance-benchmark
-
name
:
HF_TOKEN
valueFrom
:
secretKeyRef
:
name
:
hf-token-secret
key
:
token
-
name
:
TEST_SELECTOR
value
:
"
llama8B"
-
label
:
"
A100
trt
llama-70B"
priority
:
100
agents
:
queue
:
A100
plugins
:
-
kubernetes
:
podSpec
:
<<
:
*common_pod_spec
containers
:
-
image
:
nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
<<
:
*common_container_settings
env
:
-
name
:
VLLM_USAGE_SOURCE
value
:
ci-test
-
name
:
HF_HOME
value
:
/root/.cache/huggingface
-
name
:
VLLM_SOURCE_CODE_LOC
value
:
/workspace/build/buildkite/vllm/performance-benchmark
-
name
:
HF_TOKEN
valueFrom
:
secretKeyRef
:
name
:
hf-token-secret
key
:
token
-
name
:
TEST_SELECTOR
value
:
"
llama70B"
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
# - label: "A100 trt benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
# <<: *common_container_settings
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
# - label: "A100 tgi benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
# <<: *common_container_settings
-
wait
-
label
:
"
Collect
the
results"
priority
:
100
agents
:
queue
:
A100
plugins
:
-
kubernetes
:
podSpec
:
<<
:
*common_pod_spec
containers
:
-
image
:
vllm/vllm-openai:v0.5.0.post1
command
:
-
bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
resources
:
limits
:
nvidia.com/gpu
:
8
volumeMounts
:
-
name
:
devshm
mountPath
:
/dev/shm
env
:
-
name
:
VLLM_USAGE_SOURCE
value
:
ci-test
-
name
:
VLLM_SOURCE_CODE_LOC
value
:
/workspace/build/buildkite/vllm/performance-benchmark
-
name
:
HF_TOKEN
valueFrom
:
secretKeyRef
:
name
:
hf-token-secret
key
:
token
-
block
:
"
:rocket:
check
the
results!"
\ No newline at end of file
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
deleted
100644 → 0
View file @
4b51e6f1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
from
transformers
import
AutoTokenizer
def
main
(
model
,
cachedir
):
# Load the tokenizer and save it to the specified directory
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
)
tokenizer
.
save_pretrained
(
cachedir
)
print
(
f
"Tokenizer saved to
{
cachedir
}
"
)
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Download and save Hugging Face tokenizer"
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
required
=
True
,
help
=
"Name of the model"
)
parser
.
add_argument
(
"--cachedir"
,
type
=
str
,
required
=
True
,
help
=
"Directory to save the tokenizer"
)
args
=
parser
.
parse_args
()
main
(
args
.
model
,
args
.
cachedir
)
.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
deleted
100644 → 0
View file @
4b51e6f1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
json
from
pathlib
import
Path
import
numpy
as
np
import
pandas
as
pd
from
tabulate
import
tabulate
def
parse_arguments
():
parser
=
argparse
.
ArgumentParser
(
description
=
"Parse command line arguments for summary-nightly-results script."
)
parser
.
add_argument
(
"--results-folder"
,
type
=
str
,
required
=
True
,
help
=
"The folder where the results are stored."
,
)
parser
.
add_argument
(
"--description"
,
type
=
str
,
required
=
True
,
help
=
"Description of the results."
)
args
=
parser
.
parse_args
()
return
args
def
get_perf
(
df
,
method
,
model
,
metric
):
means
=
[]
for
qps
in
[
2
,
4
,
8
,
16
,
"inf"
]:
target
=
df
[
"Test name"
].
str
.
contains
(
model
)
target
=
target
&
df
[
"Engine"
].
str
.
contains
(
method
)
target
=
target
&
df
[
"Test name"
].
str
.
contains
(
"qps_"
+
str
(
qps
))
filtered_df
=
df
[
target
]
if
filtered_df
.
empty
:
means
.
append
(
0.0
)
else
:
means
.
append
(
filtered_df
[
metric
].
values
[
0
])
return
np
.
array
(
means
)
def
get_perf_w_std
(
df
,
method
,
model
,
metric
):
if
metric
in
[
"TTFT"
,
"ITL"
]:
mean
=
get_perf
(
df
,
method
,
model
,
"Mean "
+
metric
+
" (ms)"
)
mean
=
mean
.
tolist
()
std
=
get_perf
(
df
,
method
,
model
,
"Std "
+
metric
+
" (ms)"
)
if
std
.
mean
()
==
0
:
std
=
None
success
=
get_perf
(
df
,
method
,
model
,
"Successful req."
)
if
std
is
not
None
:
std
=
std
/
np
.
sqrt
(
success
)
std
=
std
.
tolist
()
else
:
assert
metric
==
"Tput"
mean
=
get_perf
(
df
,
method
,
model
,
"Input Tput (tok/s)"
)
+
get_perf
(
df
,
method
,
model
,
"Output Tput (tok/s)"
)
mean
=
mean
.
tolist
()
std
=
None
return
mean
,
std
def
main
(
args
):
results_folder
=
Path
(
args
.
results_folder
)
results
=
[]
# collect results
for
test_file
in
results_folder
.
glob
(
"*_nightly_results.json"
):
with
open
(
test_file
)
as
f
:
results
=
results
+
json
.
loads
(
f
.
read
())
# generate markdown table
df
=
pd
.
DataFrame
.
from_dict
(
results
)
md_table
=
tabulate
(
df
,
headers
=
"keys"
,
tablefmt
=
"pipe"
,
showindex
=
False
)
with
open
(
args
.
description
)
as
f
:
description
=
f
.
read
()
description
=
description
.
format
(
nightly_results_benchmarking_table
=
md_table
)
with
open
(
"nightly_results.md"
,
"w"
)
as
f
:
f
.
write
(
description
)
if
__name__
==
"__main__"
:
args
=
parse_arguments
()
main
(
args
)
Prev
1
2
3
4
5
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment