Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
488
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1630 additions
and
374 deletions
+1630
-374
.buildkite/hardware_tests/amd.yaml
.buildkite/hardware_tests/amd.yaml
+3
-2
.buildkite/hardware_tests/cpu.yaml
.buildkite/hardware_tests/cpu.yaml
+14
-0
.buildkite/image_build/image_build.sh
.buildkite/image_build/image_build.sh
+11
-12
.buildkite/image_build/image_build.yaml
.buildkite/image_build/image_build.yaml
+1
-2
.buildkite/image_build/image_build_cpu.sh
.buildkite/image_build/image_build_cpu.sh
+6
-8
.buildkite/image_build/image_build_cpu_arm64.sh
.buildkite/image_build/image_build_cpu_arm64.sh
+5
-5
.buildkite/image_build/image_build_hpu.sh
.buildkite/image_build/image_build_hpu.sh
+5
-5
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
.../lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+2
-2
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+1
-1
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+1
-1
.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
...kite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+2
-5
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+8
-2
.buildkite/performance-benchmarks/README.md
.buildkite/performance-benchmarks/README.md
+0
-1
.buildkite/performance-benchmarks/scripts/compare-json-results.py
...te/performance-benchmarks/scripts/compare-json-results.py
+628
-126
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
...formance-benchmarks/scripts/run-performance-benchmarks.sh
+453
-55
.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
...dkite/performance-benchmarks/tests/latency-tests-hpu.json
+51
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
...e/performance-benchmarks/tests/serving-tests-cpu-asr.json
+37
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
...performance-benchmarks/tests/serving-tests-cpu-embed.json
+41
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
.../performance-benchmarks/tests/serving-tests-cpu-text.json
+355
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
...dkite/performance-benchmarks/tests/serving-tests-cpu.json
+6
-147
No files found.
Too many changes to show.
To preserve performance only
488 of 488+
files are displayed.
Plain diff
Email patch
.buildkite/hardware_tests/amd.yaml
View file @
3fb4b5fa
group
:
Hardware
group
:
Hardware
- AMD Build
steps
:
-
label
:
"
AMD:
:docker:
build
image"
key
:
image-build-amd
depends_on
:
[]
device
:
amd_cpu
no_plugin
:
true
...
...
@@ -9,7 +10,7 @@ steps:
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942
;gfx950
'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
...
...
.buildkite/hardware_tests/cpu.yaml
View file @
3fb4b5fa
...
...
@@ -21,6 +21,20 @@ steps:
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
-
label
:
CPU-Compatibility Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
-
cmake/cpu_extension.cmake
-
setup.py
-
vllm/platforms/cpu.py
commands
:
-
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
-
label
:
CPU-Language Generation and Pooling Model Tests
depends_on
:
[]
soft_fail
:
true
...
...
.buildkite/image_build/image_build.sh
View file @
3fb4b5fa
...
...
@@ -8,7 +8,7 @@ clean_docker_tag() {
}
print_usage_and_exit
()
{
echo
"Usage:
$0
<registry> <repo> <commit> <branch> <
vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>
"
echo
"Usage:
$0
<registry> <repo> <commit> <branch> <
image_tag> [<image_tag_latest>]
"
exit
1
}
...
...
@@ -142,11 +142,16 @@ resolve_parent_commit() {
print_bake_config
()
{
echo
"--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE
=
"bake-config-build-
${
BUILDKITE_BUILD_NUMBER
:-
local
}
.json"
# Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local
bake_tmp
bake_tmp
=
"
$(
mktemp
-d
)
"
BAKE_CONFIG_FILE
=
"
${
bake_tmp
}
/bake-config-build-
${
BUILDKITE_BUILD_NUMBER
:-
local
}
.json"
docker buildx bake
-f
"
${
VLLM_BAKE_FILE_PATH
}
"
-f
"
${
CI_HCL_PATH
}
"
--print
"
${
TARGET
}
"
|
tee
"
${
BAKE_CONFIG_FILE
}
"
||
true
echo
"Saved bake config to
${
BAKE_CONFIG_FILE
}
"
echo
"--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload
"
${
BAKE_CONFIG_FILE
}
"
(
cd
"
$(
dirname
"
${
BAKE_CONFIG_FILE
}
"
)
"
&&
buildkite-agent artifact upload
"
$(
basename
"
${
BAKE_CONFIG_FILE
}
"
)
"
)
}
#################################
...
...
@@ -154,7 +159,7 @@ print_bake_config() {
#################################
print_instance_info
if
[[
$#
-lt
7
]]
;
then
if
[[
$#
-lt
5
]]
;
then
print_usage_and_exit
fi
...
...
@@ -163,10 +168,8 @@ REGISTRY=$1
REPO
=
$2
BUILDKITE_COMMIT
=
$3
BRANCH
=
$4
VLLM_USE_PRECOMPILED
=
$5
VLLM_MERGE_BASE_COMMIT
=
$6
IMAGE_TAG
=
$7
IMAGE_TAG_LATEST
=
${
8
:-}
# only used for main branch, optional
IMAGE_TAG
=
$5
IMAGE_TAG_LATEST
=
${
6
:-}
# only used for main branch, optional
# build config
TARGET
=
"test-ci"
...
...
@@ -193,8 +196,6 @@ export CACHE_FROM
export
CACHE_FROM_BASE_BRANCH
export
CACHE_FROM_MAIN
export
CACHE_TO
export
VLLM_USE_PRECOMPILED
export
VLLM_MERGE_BASE_COMMIT
# print args
echo
"--- :mag: Arguments"
...
...
@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
echo
"REPO:
${
REPO
}
"
echo
"BUILDKITE_COMMIT:
${
BUILDKITE_COMMIT
}
"
echo
"BRANCH:
${
BRANCH
}
"
echo
"VLLM_USE_PRECOMPILED:
${
VLLM_USE_PRECOMPILED
}
"
echo
"VLLM_MERGE_BASE_COMMIT:
${
VLLM_MERGE_BASE_COMMIT
}
"
echo
"IMAGE_TAG:
${
IMAGE_TAG
}
"
echo
"IMAGE_TAG_LATEST:
${
IMAGE_TAG_LATEST
}
"
...
...
.buildkite/image_build/image_build.yaml
View file @
3fb4b5fa
...
...
@@ -5,8 +5,7 @@ steps:
depends_on
:
[]
timeout_in_minutes
:
600
commands
:
-
if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
-
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
-
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
retry
:
automatic
:
-
exit_status
:
-1
# Agent was lost
...
...
.buildkite/image_build/image_build_cpu.sh
View file @
3fb4b5fa
...
...
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT
=
$3
# authenticate with AWS ECR
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
# skip build if image already exists
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
)
]]
;
then
if
[[
-z
$(
docker manifest inspect
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-cpu
)
]]
;
then
echo
"Image not found, proceeding with build..."
else
echo
"Image found"
...
...
@@ -24,13 +24,11 @@ fi
# build
docker build
--file
docker/Dockerfile.cpu
\
--build-arg
max_jobs
=
16
\
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--build-arg
VLLM_CPU_AVX512BF16
=
true
\
--build-arg
VLLM_CPU_AVX512VNNI
=
true
\
--build-arg
VLLM_CPU_AMXBF16
=
true
\
--tag
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
\
--build-arg
buildkite_commit
=
"
$BUILDKITE_COMMIT
"
\
--build-arg
VLLM_CPU_X86
=
true
\
--tag
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-cpu
\
--target
vllm-test
\
--progress
plain
.
# push
docker push
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
docker push
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-cpu
.buildkite/image_build/image_build_cpu_arm64.sh
View file @
3fb4b5fa
...
...
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT
=
$3
# authenticate with AWS ECR
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
# skip build if image already exists
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
)
]]
;
then
if
[[
-z
$(
docker manifest inspect
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-arm64
-cpu
)
]]
;
then
echo
"Image not found, proceeding with build..."
else
echo
"Image found"
...
...
@@ -24,10 +24,10 @@ fi
# build
docker build
--file
docker/Dockerfile.cpu
\
--build-arg
max_jobs
=
16
\
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--tag
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
\
--build-arg
buildkite_commit
=
"
$BUILDKITE_COMMIT
"
\
--tag
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-arm64
-cpu
\
--target
vllm-test
\
--progress
plain
.
# push
docker push
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
docker push
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-arm64
-cpu
.buildkite/image_build/image_build_hpu.sh
View file @
3fb4b5fa
...
...
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT
=
$3
# authenticate with AWS ECR
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
# skip build if image already exists
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-hpu
)
]]
;
then
if
[[
-z
$(
docker manifest inspect
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-hpu
)
]]
;
then
echo
"Image not found, proceeding with build..."
else
echo
"Image found"
...
...
@@ -25,10 +25,10 @@ fi
docker build
\
--file
tests/pytorch_ci_hud_benchmark/Dockerfile.hpu
\
--build-arg
max_jobs
=
16
\
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--tag
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-hpu
\
--build-arg
buildkite_commit
=
"
$BUILDKITE_COMMIT
"
\
--tag
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-hpu
\
--progress
plain
\
https://github.com/vllm-project/vllm-gaudi.git
# push
docker push
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-hpu
docker push
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-hpu
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
View file @
3fb4b5fa
...
...
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
echo
``
...
...
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks
chartqa
\
--batch_size
auto
\
--apply_chat_template
\
--limit
$LIMIT
--limit
"
$LIMIT
"
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
View file @
3fb4b5fa
...
...
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
echo
``
...
...
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
View file @
3fb4b5fa
...
...
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
echo
``
...
...
.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
View file @
3fb4b5fa
...
...
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
echo
``
...
...
@@ -20,14 +20,11 @@ usage() {
echo
}
while
getopts
"m:
b:
l:f:t:"
OPT
;
do
while
getopts
"m:l:f:t:"
OPT
;
do
case
${
OPT
}
in
m
)
MODEL
=
"
$OPTARG
"
;;
b
)
BATCH_SIZE
=
"
$OPTARG
"
;;
l
)
LIMIT
=
"
$OPTARG
"
;;
...
...
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
View file @
3fb4b5fa
...
...
@@ -13,9 +13,10 @@ import os
from
contextlib
import
contextmanager
import
lm_eval
import
numpy
as
np
import
yaml
from
vllm.platforms
import
current_platform
DEFAULT_RTOL
=
0.08
...
...
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True,"
)
if
current_platform
.
is_rocm
()
and
"Nemotron-3"
in
eval_config
[
"model_name"
]:
model_args
+=
"attention_backend=TRITON_ATTN"
env_vars
=
eval_config
.
get
(
"env_vars"
,
None
)
with
scoped_env_vars
(
env_vars
):
results
=
lm_eval
.
simple_evaluate
(
...
...
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f
"ground_truth=
{
ground_truth
:.
3
f
}
| "
f
"measured=
{
measured_value
:.
3
f
}
| rtol=
{
rtol
}
"
)
success
=
success
and
np
.
isclose
(
ground_truth
,
measured_value
,
rtol
=
rtol
)
min_acceptable
=
ground_truth
*
(
1
-
rtol
)
success
=
success
and
measured_value
>=
min_acceptable
assert
success
.buildkite/performance-benchmarks/README.md
View file @
3fb4b5fa
...
...
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"tensor_parallel_size"
:
1
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
},
...
...
.buildkite/performance-benchmarks/scripts/compare-json-results.py
View file @
3fb4b5fa
...
...
@@ -7,8 +7,10 @@ import argparse
import
html
as
_html
import
json
import
os
from
contextlib
import
nullcontext
from
dataclasses
import
dataclass
from
importlib
import
util
from
pathlib
import
Path
import
pandas
as
pd
...
...
@@ -31,6 +33,45 @@ pd.set_option("display.precision", 2)
pd
.
set_option
(
"display.float_format"
,
lambda
x
:
f
"
{
x
:.
2
f
}
"
)
# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def
_find_concurrency_col
(
df
:
pd
.
DataFrame
)
->
str
:
for
c
in
[
"# of max concurrency."
,
"# of max concurrency"
,
"Max Concurrency"
,
"max_concurrency"
,
"Concurrency"
,
]:
if
c
in
df
.
columns
:
return
c
for
c
in
df
.
columns
:
if
"concurr"
in
str
(
c
).
lower
():
s
=
df
[
c
]
if
s
.
dtype
.
kind
in
"iu"
and
s
.
nunique
()
>
1
and
s
.
min
()
>=
1
:
return
c
raise
ValueError
(
"Cannot infer concurrency column. "
"Please rename the column to one of the known names "
"or add an explicit override (e.g., --concurrency-col)."
)
def
_normalize_concurrency_in_df
(
df
:
pd
.
DataFrame
,
canonical
:
str
=
"# of max concurrency."
)
->
pd
.
DataFrame
:
if
canonical
in
df
.
columns
:
return
df
detected
=
_find_concurrency_col
(
df
)
if
detected
in
df
.
columns
and
detected
!=
canonical
:
return
df
.
rename
(
columns
=
{
detected
:
canonical
})
df
[
canonical
]
=
pd
.
NA
return
df
# -----------------------------
# Core data compare
# -----------------------------
...
...
@@ -50,19 +91,25 @@ def compare_data_columns(
- Concat along axis=1 (indexes align), then reset_index so callers can
group by columns.
- If --debug, add a <file_label>_name column per file.
Minimal fix to support different max_concurrency lists across files:
- normalize concurrency column naming to "# of max concurrency."
- align on UNION of keys (missing points become NaN)
- BUGFIX: don't drop throughput rows based on P99/Median presence
"""
print
(
"
\n
compare_data_column:"
,
data_column
)
frames
=
[]
raw_data_cols
:
list
[
str
]
=
[]
compare_frames
=
[]
# Determine key cols after normalizing concurrency
cols_per_file
:
list
[
set
]
=
[]
for
f
in
files
:
try
:
df_tmp
=
pd
.
read_json
(
f
,
orient
=
"records"
)
except
Exception
as
err
:
raise
ValueError
(
f
"Failed to read
{
f
}
"
)
from
err
df_tmp
=
_normalize_concurrency_in_df
(
df_tmp
,
canonical
=
"# of max concurrency."
)
cols_per_file
.
append
(
set
(
df_tmp
.
columns
))
key_cols
=
[
c
for
c
in
info_cols
if
all
(
c
in
cset
for
cset
in
cols_per_file
)]
...
...
@@ -73,12 +120,25 @@ def compare_data_columns(
"No common key columns found from info_cols across the input files."
)
meta_added
=
False
union_index
=
None
metas
:
list
[
pd
.
DataFrame
]
=
[]
staged
:
list
[
tuple
[
str
,
pd
.
Series
,
pd
.
Series
|
None
]]
=
[]
for
file
in
files
:
df
=
pd
.
read_json
(
file
,
orient
=
"records"
)
if
drop_column
in
df
.
columns
:
df
=
_normalize_concurrency_in_df
(
df
,
canonical
=
"# of max concurrency."
)
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have
# NaN in P99/Median columns even if the column exists in the JSON.
metric_lc
=
str
(
data_column
).
lower
()
is_latency_metric
=
(
"ttft"
in
metric_lc
or
"tpot"
in
metric_lc
or
"p99"
in
metric_lc
or
"median"
in
metric_lc
or
metric_lc
.
strip
()
in
{
"p99"
,
"median"
}
)
if
is_latency_metric
and
drop_column
in
df
.
columns
:
df
=
df
.
dropna
(
subset
=
[
drop_column
],
ignore_index
=
True
)
for
c
in
(
...
...
@@ -103,35 +163,61 @@ def compare_data_columns(
meta
=
meta
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
first
()
file_label
=
"/"
.
join
(
file
.
split
(
"/"
)[:
-
1
])
or
os
.
path
.
basename
(
file
)
s
=
df_idx
[
data_column
]
if
not
s
.
index
.
is_unique
:
s
=
s
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
mean
()
s
.
name
=
file_label
if
not
meta_added
:
frames
.
append
(
meta
)
meta_added
=
True
if
data_column
in
df_idx
.
columns
:
s
=
df_idx
[
data_column
]
if
not
s
.
index
.
is_unique
:
s
=
s
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
mean
()
else
:
# keep NA series to preserve meta keys for union_index
s
=
pd
.
Series
(
pd
.
NA
,
index
=
meta
.
index
)
s
.
name
=
file_label
name_s
=
None
if
debug
and
name_column
in
df_idx
.
columns
:
name_s
=
df_idx
[
name_column
]
if
not
name_s
.
index
.
is_unique
:
name_s
=
name_s
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
first
()
name_s
.
name
=
f
"
{
file_label
}
_name"
frames
.
append
(
name_s
)
frames
.
append
(
s
)
if
union_index
is
None
:
union_index
=
meta
.
index
else
:
union_index
=
union_index
.
union
(
meta
.
index
)
metas
.
append
(
meta
)
staged
.
append
((
file_label
,
s
,
name_s
))
if
union_index
is
None
:
raise
ValueError
(
"No data found after loading inputs."
)
# meta first (union-aligned): build UNION meta across all files
if
metas
:
meta_union
=
pd
.
concat
(
metas
,
axis
=
0
)
# Collapse duplicates on the MultiIndex; keep first non-null per column
meta_union
=
meta_union
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
first
()
frames
.
append
(
meta_union
.
reindex
(
union_index
))
# values + ratios (union-aligned)
metric_series_aligned
:
list
[
pd
.
Series
]
=
[]
for
file_label
,
s
,
name_s
in
staged
:
s_aligned
=
s
.
reindex
(
union_index
)
frames
.
append
(
s_aligned
)
raw_data_cols
.
append
(
file_label
)
compare_frames
.
append
(
s
)
metric_series_aligned
.
append
(
s_aligned
)
if
debug
and
name_s
is
not
None
:
frames
.
append
(
name_s
.
reindex
(
union_index
))
if
len
(
compare_frames
)
>=
2
:
base
=
compare_frames
[
0
]
current
=
compare_frames
[
-
1
]
if
"P99"
in
data_column
or
"Median"
in
data_column
:
if
len
(
metric_series_aligned
)
>=
2
:
base
=
metric_series_aligned
[
0
]
current
=
metric_series_aligned
[
-
1
]
if
"P99"
in
str
(
data_column
)
or
"Median"
in
str
(
data_column
)
:
ratio
=
base
/
current
else
:
ratio
=
current
/
base
ratio
=
ratio
.
mask
(
base
==
0
)
ratio
.
name
=
f
"Ratio 1 vs
{
len
(
compare_frames
)
}
"
ratio
.
name
=
f
"Ratio 1 vs
{
len
(
metric_series_aligned
)
}
"
frames
.
append
(
ratio
)
concat_df
=
pd
.
concat
(
frames
,
axis
=
1
).
reset_index
(
drop
=
True
)
...
...
@@ -202,24 +288,10 @@ def split_json_by_tp_pp(
# -----------------------------
# Styling helpers
# -----------------------------
def
_find_concurrency_col
(
df
:
pd
.
DataFrame
)
->
str
:
for
c
in
[
"# of max concurrency."
,
"# of max concurrency"
,
"Max Concurrency"
,
"max_concurrency"
,
"Concurrency"
,
]:
if
c
in
df
.
columns
:
return
c
for
c
in
df
.
columns
:
if
df
[
c
].
dtype
.
kind
in
"iu"
and
df
[
c
].
nunique
()
>
1
and
df
[
c
].
min
()
>=
1
:
return
c
return
"# of max concurrency."
def
_highlight_threshold
(
df
:
pd
.
DataFrame
,
threshold
:
float
df
:
pd
.
DataFrame
,
threshold
:
float
,
slack_pct
:
float
=
0.0
,
)
->
pd
.
io
.
formats
.
style
.
Styler
:
conc_col
=
_find_concurrency_col
(
df
)
key_cols
=
[
...
...
@@ -232,12 +304,24 @@ def _highlight_threshold(
]
conf_cols
=
[
c
for
c
in
conf_cols
if
pd
.
api
.
types
.
is_numeric_dtype
(
df
[
c
])]
return
df
.
style
.
map
(
lambda
v
:
"background-color:#e6ffe6;font-weight:bold;"
if
pd
.
notna
(
v
)
and
v
<=
threshold
else
""
,
subset
=
conf_cols
,
)
try
:
slack_pct
=
float
(
slack_pct
or
0.0
)
except
Exception
:
slack_pct
=
0.0
slack_limit
=
threshold
*
(
1.0
+
slack_pct
/
100.0
)
def
_cell
(
v
):
if
pd
.
isna
(
v
):
return
""
if
v
<=
threshold
:
# Strict SLA
return
"background-color:#e6ffe6;font-weight:bold;"
if
v
<=
slack_limit
:
# Within slack range
return
"background-color:#ffe5cc;font-weight:bold;"
return
""
return
df
.
style
.
map
(
_cell
,
subset
=
conf_cols
)
def
highlight_ratio_columns
(
styler
:
pd
.
io
.
formats
.
style
.
Styler
):
...
...
@@ -275,6 +359,177 @@ def _apply_two_decimals(
return
styler
.
format
({
c
:
"{:.2f}"
for
c
in
num_cols
},
na_rep
=
""
)
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def
_sanitize_sheet_name
(
name
:
str
)
->
str
:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
module's compile overhead/edge-cases on some systems.
"""
name
=
"sheet"
if
name
is
None
else
str
(
name
)
# Replace illegal characters with underscore.
trans
=
str
.
maketrans
(
{
":"
:
"_"
,
"
\\
"
:
"_"
,
"/"
:
"_"
,
"?"
:
"_"
,
"*"
:
"_"
,
"["
:
"_"
,
"]"
:
"_"
,
}
)
name
=
name
.
translate
(
trans
)
# Strip quotes/spaces and collapse whitespace.
name
=
name
.
strip
().
strip
(
"'"
)
name
=
" "
.
join
(
name
.
split
())
if
not
name
:
name
=
"sheet"
return
name
[:
31
]
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
    """Derive a sheet name from one group key.

    ``group_cols`` and ``gkey_tuple`` are zipped into a mapping. The name is
    the last "/"-separated component of the "Model" value (default "model"),
    followed by "_<Input Len>x<Output Len>" when both lengths are present.
    The model part is cut so that model + lengths aim to fit in 31 characters,
    and the result is passed through ``_sanitize_sheet_name``.
    """
    fields = dict(zip(group_cols, gkey_tuple))

    # The length suffix is only emitted when both lengths are available.
    input_len = fields.get("Input Len", "")
    output_len = fields.get("Output Len", "")
    if input_len != "" and output_len != "":
        suffix = f"_{input_len}x{output_len}"
    else:
        suffix = ""

    # Keep only the leaf of the model path and give it whatever room the
    # suffix leaves (at least one character).
    model_leaf = str(fields.get("Model", "model")).split("/")[-1]
    budget = max(1, 31 - len(suffix))

    return _sanitize_sheet_name(model_leaf[:budget] + suffix)
def _write_tables_to_excel_sheet(
    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
    """Write every (title, table) block to ``sheet`` with one to_excel() call.

    Each table is copied and given a leading "Section" column holding its
    title; the copies are stacked row-wise and written without the index.
    An empty ``blocks`` list writes an empty sheet.

    Issuing a single write per sheet avoids the slowness of many repeated
    pandas+openpyxl calls.
    """

    def _labelled(title: str, table: pd.DataFrame) -> pd.DataFrame:
        # Work on a copy so the caller's frame is left untouched.
        tagged = table.copy()
        tagged.insert(0, "Section", title)
        return tagged

    if not blocks:
        pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
        return

    stacked = pd.concat(
        [_labelled(title, table) for title, table in blocks],
        axis=0,
        ignore_index=True,
        sort=False,
    )
    stacked.to_excel(writer, sheet_name=sheet, index=False)
def
_safe_filename
(
s
:
str
)
->
str
:
# Fast path without the third-party `regex` module.
s
=
" "
.
join
(
str
(
s
).
strip
().
split
())
allowed
=
[]
for
ch
in
s
:
if
ch
.
isalnum
()
or
ch
in
"._-"
:
allowed
.
append
(
ch
)
else
:
allowed
.
append
(
"_"
)
out
=
""
.
join
(
allowed
)
return
out
[:
180
]
if
len
(
out
)
>
180
else
out
# -----------------------------
# vLLM environment export helper
# -----------------------------
def
_parse_vllm_env_txt
(
env_path
:
Path
)
->
pd
.
DataFrame
:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines
=
env_path
.
read_text
(
encoding
=
"utf-8"
,
errors
=
"replace"
).
splitlines
()
section
=
"General"
rows
:
list
[
dict
]
=
[]
def
set_section
(
s
:
str
):
nonlocal
section
s
=
(
s
or
""
).
strip
()
if
s
:
section
=
s
for
raw
in
lines
:
stripped
=
raw
.
strip
()
if
not
stripped
:
continue
# divider lines like =====
if
set
(
stripped
)
<=
{
"="
}:
continue
# section header heuristic: short standalone line
if
":"
not
in
stripped
and
"="
not
in
stripped
and
len
(
stripped
)
<=
64
:
if
stripped
.
lower
().
startswith
(
"collecting environment information"
):
continue
set_section
(
stripped
)
continue
# env var style: KEY=VALUE (and not a URL with :)
if
"="
in
stripped
and
":"
not
in
stripped
:
k
,
v
=
stripped
.
split
(
"="
,
1
)
k
=
k
.
strip
()
v
=
v
.
strip
()
if
k
:
rows
.
append
({
"Section"
:
section
,
"Key"
:
k
,
"Value"
:
v
})
continue
# key: value
if
":"
in
stripped
:
k
,
v
=
stripped
.
split
(
":"
,
1
)
k
=
k
.
strip
()
v
=
v
.
strip
()
if
k
:
rows
.
append
({
"Section"
:
section
,
"Key"
:
k
,
"Value"
:
v
})
continue
return
pd
.
DataFrame
(
rows
,
columns
=
[
"Section"
,
"Key"
,
"Value"
])
def
_load_env_df_for_inputs
(
args
,
files
:
list
[
str
])
->
pd
.
DataFrame
|
None
:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir
:
Path
|
None
=
None
if
getattr
(
args
,
"file"
,
None
):
base_dir
=
Path
(
args
.
file
[
0
]).
resolve
().
parent
elif
files
:
base_dir
=
Path
(
files
[
0
]).
resolve
().
parent
if
base_dir
is
None
:
return
None
env_path
=
base_dir
/
"vllm_env.txt"
if
not
env_path
.
exists
():
return
None
df
=
_parse_vllm_env_txt
(
env_path
)
return
df
# -----------------------------
# Valid max concurrency summary helpers
# -----------------------------
...
...
@@ -301,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def
_max_concurrency_ok
(
df
:
pd
.
DataFrame
,
conc_col
:
str
,
cfg_col
:
str
,
threshold
:
float
df
:
pd
.
DataFrame
,
conc_col
:
str
,
cfg_col
:
str
,
threshold
:
float
,
slack_pct
:
float
=
0.0
,
):
if
df
is
None
or
conc_col
not
in
df
.
columns
or
cfg_col
not
in
df
.
columns
:
return
pd
.
NA
...
...
@@ -314,7 +573,14 @@ def _max_concurrency_ok(
if
d
.
empty
:
return
pd
.
NA
ok
=
d
[
d
[
cfg_col
]
<=
threshold
]
# Accept values up to (1 + slack_pct%) above the SLA.
try
:
slack_pct
=
float
(
slack_pct
or
0.0
)
except
Exception
:
slack_pct
=
0.0
effective_limit
=
float
(
threshold
)
*
(
1.0
+
slack_pct
/
100.0
)
ok
=
d
[
d
[
cfg_col
]
<=
effective_limit
]
if
ok
.
empty
:
return
pd
.
NA
...
...
@@ -380,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
if
not
cfg_cols
:
cfg_cols
=
sorted
(
set
(
ttft_cols
)
|
set
(
tpot_cols
)
|
set
(
tput_cols
),
key
=
str
)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi
=
args
.
ttft_max_ms
*
(
1.0
+
args
.
ttft_slack_pct
/
100.0
)
tpot_hi
=
args
.
tpot_max_ms
*
(
1.0
+
args
.
tpot_slack_pct
/
100.0
)
ttft_range
=
f
"
{
args
.
ttft_max_ms
:
g
}
–
{
ttft_hi
:
g
}
ms (+
{
args
.
ttft_slack_pct
:
g
}
%)"
tpot_range
=
f
"
{
args
.
tpot_max_ms
:
g
}
–
{
tpot_hi
:
g
}
ms (+
{
args
.
tpot_slack_pct
:
g
}
%)"
rows
=
[]
for
cfg
in
cfg_cols
:
ttft_max
=
(
_max_concurrency_ok
(
ttft_group_df
,
conc_col
,
cfg
,
args
.
ttft_max_ms
)
_max_concurrency_ok
(
ttft_group_df
,
conc_col
,
cfg
,
args
.
ttft_max_ms
,
args
.
ttft_slack_pct
)
if
ttft_group_df
is
not
None
else
pd
.
NA
)
tpot_max
=
(
_max_concurrency_ok
(
tpot_group_df
,
conc_col
,
cfg
,
args
.
tpot_max_ms
)
_max_concurrency_ok
(
tpot_group_df
,
conc_col
,
cfg
,
args
.
tpot_max_ms
,
args
.
tpot_slack_pct
)
if
tpot_group_df
is
not
None
else
pd
.
NA
)
...
...
@@ -417,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
rows
.
append
(
{
"Configuration"
:
cfg
,
f
"Max
{
conc_col
}
(TTFT ≤
{
args
.
ttft_max_ms
:
g
}
ms
)"
:
ttft_max
,
f
"Max
{
conc_col
}
(TPOT ≤
{
args
.
tpot_max_ms
:
g
}
ms
)"
:
tpot_max
,
f
"Max
{
conc_col
}
(TTFT ≤
{
ttft_range
}
)"
:
ttft_max
,
f
"Max
{
conc_col
}
(TPOT ≤
{
tpot_range
}
)"
:
tpot_max
,
f
"Max
{
conc_col
}
(Both)"
:
both
,
"Output Tput @ Both (tok/s)"
:
tput_at_both
,
"TTFT @ Both (ms)"
:
ttft_at_both
,
...
...
@@ -428,7 +704,6 @@ def build_valid_max_concurrency_summary_html(
summary_df
=
pd
.
DataFrame
(
rows
)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for
c
in
summary_df
.
columns
:
if
c
==
"Configuration"
:
continue
...
...
@@ -436,12 +711,10 @@ def build_valid_max_concurrency_summary_html(
both_col
=
f
"Max
{
conc_col
}
(Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters
=
{}
for
c
in
summary_df
.
columns
:
if
c
==
"Configuration"
:
continue
# default argument binds per-column formatter correctly
formatters
[
c
]
=
lambda
v
:
""
if
pd
.
isna
(
v
)
else
f
"
{
float
(
v
):.
2
f
}
"
styler
=
summary_df
.
style
.
format
(
formatters
)
...
...
@@ -460,6 +733,104 @@ def build_valid_max_concurrency_summary_html(
return
title
+
styler
.
to_html
(
table_attributes
=
'border="1" class="dataframe"'
)
def
build_valid_max_concurrency_summary_df
(
tput_group_df
:
pd
.
DataFrame
|
None
,
ttft_group_df
:
pd
.
DataFrame
|
None
,
tpot_group_df
:
pd
.
DataFrame
|
None
,
conc_col
:
str
,
args
,
)
->
pd
.
DataFrame
|
None
:
if
ttft_group_df
is
None
and
tpot_group_df
is
None
:
return
None
ttft_cols
=
(
_config_value_columns
(
ttft_group_df
,
conc_col
)
if
ttft_group_df
is
not
None
else
[]
)
tpot_cols
=
(
_config_value_columns
(
tpot_group_df
,
conc_col
)
if
tpot_group_df
is
not
None
else
[]
)
tput_cols
=
(
_config_value_columns
(
tput_group_df
,
conc_col
)
if
tput_group_df
is
not
None
else
[]
)
if
ttft_group_df
is
not
None
and
tpot_group_df
is
not
None
:
cfg_cols
=
[
c
for
c
in
ttft_cols
if
c
in
tpot_cols
]
if
tput_group_df
is
not
None
:
cfg_cols
=
[
c
for
c
in
cfg_cols
if
c
in
tput_cols
]
or
cfg_cols
else
:
cfg_cols
=
ttft_cols
or
tpot_cols
if
not
cfg_cols
:
cfg_cols
=
sorted
(
set
(
ttft_cols
)
|
set
(
tpot_cols
)
|
set
(
tput_cols
),
key
=
str
)
ttft_hi
=
args
.
ttft_max_ms
*
(
1.0
+
args
.
ttft_slack_pct
/
100.0
)
tpot_hi
=
args
.
tpot_max_ms
*
(
1.0
+
args
.
tpot_slack_pct
/
100.0
)
ttft_range
=
f
"
{
args
.
ttft_max_ms
:
g
}
–
{
ttft_hi
:
g
}
ms (+
{
args
.
ttft_slack_pct
:
g
}
%)"
tpot_range
=
f
"
{
args
.
tpot_max_ms
:
g
}
–
{
tpot_hi
:
g
}
ms (+
{
args
.
tpot_slack_pct
:
g
}
%)"
rows
=
[]
for
cfg
in
cfg_cols
:
ttft_max
=
(
_max_concurrency_ok
(
ttft_group_df
,
conc_col
,
cfg
,
args
.
ttft_max_ms
,
args
.
ttft_slack_pct
)
if
ttft_group_df
is
not
None
else
pd
.
NA
)
tpot_max
=
(
_max_concurrency_ok
(
tpot_group_df
,
conc_col
,
cfg
,
args
.
tpot_max_ms
,
args
.
tpot_slack_pct
)
if
tpot_group_df
is
not
None
else
pd
.
NA
)
both
=
(
pd
.
NA
if
(
pd
.
isna
(
ttft_max
)
or
pd
.
isna
(
tpot_max
))
else
min
(
ttft_max
,
tpot_max
)
)
tput_at_both
=
(
_value_at_concurrency
(
tput_group_df
,
conc_col
,
cfg
,
both
)
if
tput_group_df
is
not
None
else
pd
.
NA
)
ttft_at_both
=
(
_value_at_concurrency
(
ttft_group_df
,
conc_col
,
cfg
,
both
)
if
ttft_group_df
is
not
None
else
pd
.
NA
)
tpot_at_both
=
(
_value_at_concurrency
(
tpot_group_df
,
conc_col
,
cfg
,
both
)
if
tpot_group_df
is
not
None
else
pd
.
NA
)
rows
.
append
(
{
"Configuration"
:
cfg
,
f
"Max
{
conc_col
}
(TTFT ≤
{
ttft_range
}
)"
:
ttft_max
,
f
"Max
{
conc_col
}
(TPOT ≤
{
tpot_range
}
)"
:
tpot_max
,
f
"Max
{
conc_col
}
(Both)"
:
both
,
"Output Tput @ Both (tok/s)"
:
tput_at_both
,
"TTFT @ Both (ms)"
:
ttft_at_both
,
"TPOT @ Both (ms)"
:
tpot_at_both
,
}
)
df
=
pd
.
DataFrame
(
rows
)
for
c
in
df
.
columns
:
if
c
!=
"Configuration"
:
df
[
c
]
=
pd
.
to_numeric
(
df
[
c
],
errors
=
"coerce"
)
return
df
# -----------------------------
# Plot helper
# -----------------------------
...
...
@@ -537,6 +908,35 @@ def build_parser() -> argparse.ArgumentParser:
default
=
100.0
,
help
=
"Reference limit for TPOT plots (ms)"
,
)
# ---- SLA tolerance (slack) options ----
parser
.
add_argument
(
"--ttft-slack-pct"
,
type
=
float
,
default
=
5.0
,
help
=
"Allowed percentage above TTFT SLA (default: 5)."
,
)
parser
.
add_argument
(
"--tpot-slack-pct"
,
type
=
float
,
default
=
5.0
,
help
=
"Allowed percentage above TPOT SLA (default: 5)."
,
)
# ---- export options ----
parser
.
add_argument
(
"--excel-out"
,
type
=
str
,
default
=
"perf_comparison.xlsx"
,
help
=
"Write one sheet per (Model, Dataset, Input Len, Output Len)."
,
)
parser
.
add_argument
(
"--csv-out-dir"
,
type
=
str
,
default
=
""
,
help
=
"If set, write per-group per-metric CSVs into this directory."
,
)
return
parser
...
...
@@ -615,9 +1015,13 @@ def render_metric_table_html(
metric_name
=
metric_label
.
lower
()
if
"ttft"
in
metric_name
:
styler
=
_highlight_threshold
(
display_group
,
args
.
ttft_max_ms
)
styler
=
_highlight_threshold
(
display_group
,
args
.
ttft_max_ms
,
args
.
ttft_slack_pct
)
elif
(
"tpot"
in
metric_name
)
or
(
"median"
in
metric_name
)
or
(
"p99"
in
metric_name
):
styler
=
_highlight_threshold
(
display_group
,
args
.
tpot_max_ms
)
styler
=
_highlight_threshold
(
display_group
,
args
.
tpot_max_ms
,
args
.
tpot_slack_pct
)
else
:
styler
=
display_group
.
style
...
...
@@ -657,7 +1061,6 @@ def maybe_write_plot(
markers
=
True
,
)
# Ensure plot hover + y tick labels are also 2 decimals.
fig
.
update_traces
(
hovertemplate
=
"%{y:.2f}<extra></extra>"
)
fig
.
update_yaxes
(
tickformat
=
".2f"
)
...
...
@@ -730,87 +1133,186 @@ def write_report_group_first(
for
metric_label
,
(
df
,
_
)
in
metric_cache
.
items
()
}
with
open
(
"perf_comparison.html"
,
"w"
,
encoding
=
"utf-8"
)
as
main_fh
:
main_fh
.
write
(
'<meta charset="utf-8">
\n
'
)
for
gkey
in
group_keys
:
gkey_tuple
=
normalize_group_key
(
gkey
)
suffix
=
build_group_suffix
(
group_cols_canonical
,
gkey_tuple
)
sub_path
=
group_filename
(
gkey_tuple
)
group_header
=
(
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f
"
{
_html
.
escape
(
suffix
)
}
"
"</div>
\n
"
)
main_fh
.
write
(
group_header
)
with
open
(
sub_path
,
"w"
,
encoding
=
"utf-8"
)
as
sub_fh
:
sub_fh
.
write
(
'<meta charset="utf-8">
\n
'
)
sub_fh
.
write
(
group_header
)
tput_group_df
=
None
ttft_group_df
=
None
tpot_group_df
=
None
conc_col
=
args
.
xaxis
for
metric_label
in
plan
.
data_cols
:
gb
=
metric_groupbys
[
metric_label
]
df_sorted
,
raw_data_cols
=
metric_cache
[
metric_label
]
try
:
group_df
=
gb
.
get_group
(
gkey
)
except
KeyError
:
missing
=
(
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f
"
{
_html
.
escape
(
metric_label
)
}
— missing for this group"
"</div>
\n
"
)
csv_dir
=
Path
(
args
.
csv_out_dir
)
if
args
.
csv_out_dir
else
None
if
csv_dir
:
csv_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
main_fh
.
write
(
missing
)
sub_fh
.
write
(
missing
)
continue
excel_path
=
args
.
excel_out
or
"perf_comparison.xlsx"
disable_excel
=
os
.
getenv
(
"VLLM_COMPARE_DISABLE_EXCEL"
,
"0"
)
==
"1"
if
conc_col
not
in
group_df
.
columns
:
conc_col
=
_find_concurrency_col
(
group_df
)
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
excel_engine
=
(
os
.
getenv
(
"VLLM_COMPARE_EXCEL_ENGINE"
,
"xlsxwriter"
).
strip
()
or
"xlsxwriter"
)
if
excel_engine
==
"xlsxwriter"
and
util
.
find_spec
(
"xlsxwriter"
)
is
None
:
excel_engine
=
"openpyxl"
excel_engine_kwargs
=
{}
if
excel_engine
==
"xlsxwriter"
:
# Reduce memory pressure & usually faster writes.
excel_engine_kwargs
=
{
"options"
:
{
"constant_memory"
:
True
}}
xw_ctx
=
(
nullcontext
(
None
)
if
disable_excel
else
pd
.
ExcelWriter
(
excel_path
,
engine
=
excel_engine
,
engine_kwargs
=
excel_engine_kwargs
)
)
with
xw_ctx
as
xw
:
used_sheets
:
set
[
str
]
=
set
()
# ---- Environment sheet (first) ----
env_sheet
=
_sanitize_sheet_name
(
"Environment"
)
env_df
=
_load_env_df_for_inputs
(
args
,
files
)
if
xw
is
not
None
:
if
env_df
is
None
or
env_df
.
empty
:
pd
.
DataFrame
(
[
{
"Section"
:
"Environment"
,
"Key"
:
"vllm_env.txt"
,
"Value"
:
"NOT FOUND (or empty)"
,
}
]
).
to_excel
(
xw
,
sheet_name
=
env_sheet
,
index
=
False
)
else
:
env_df
.
to_excel
(
xw
,
sheet_name
=
env_sheet
,
index
=
False
)
used_sheets
.
add
(
env_sheet
)
with
open
(
"perf_comparison.html"
,
"w"
,
encoding
=
"utf-8"
)
as
main_fh
:
main_fh
.
write
(
'<meta charset="utf-8">
\n
'
)
for
gkey
in
group_keys
:
gkey_tuple
=
normalize_group_key
(
gkey
)
suffix
=
build_group_suffix
(
group_cols_canonical
,
gkey_tuple
)
sub_path
=
group_filename
(
gkey_tuple
)
group_header
=
(
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f
"
{
_html
.
escape
(
suffix
)
}
"
"</div>
\n
"
)
mn
=
metric_label
.
lower
().
strip
()
if
"tok/s"
in
mn
:
tput_group_df
=
group_df
elif
"ttft"
in
mn
:
ttft_group_df
=
group_df
elif
mn
in
(
"p99"
,
"median"
)
or
"tpot"
in
mn
:
tpot_group_df
=
group_df
main_fh
.
write
(
group_header
)
do_excel
=
xw
is
not
None
sheet
=
_group_to_sheet_base
(
group_cols_canonical
,
gkey_tuple
)
sheet_base
=
sheet
if
do_excel
:
dedup_i
=
1
while
sheet
in
used_sheets
:
dedup_i
+=
1
suffix
=
f
"_
{
dedup_i
}
"
# Ensure uniqueness even when sheet names are truncated.
base
=
str
(
sheet_base
)
keep
=
max
(
1
,
31
-
len
(
suffix
))
sheet
=
_sanitize_sheet_name
(
base
[:
keep
]
+
suffix
)
used_sheets
.
add
(
sheet
)
excel_blocks
:
list
[
tuple
[
str
,
pd
.
DataFrame
]]
=
[]
with
open
(
sub_path
,
"w"
,
encoding
=
"utf-8"
)
as
sub_fh
:
sub_fh
.
write
(
'<meta charset="utf-8">
\n
'
)
sub_fh
.
write
(
group_header
)
tput_group_df
=
None
ttft_group_df
=
None
tpot_group_df
=
None
conc_col
=
args
.
xaxis
for
metric_label
in
plan
.
data_cols
:
gb
=
metric_groupbys
[
metric_label
]
df_sorted
,
raw_data_cols
=
metric_cache
[
metric_label
]
try
:
group_df
=
gb
.
get_group
(
gkey
)
except
KeyError
:
missing
=
(
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f
"
{
_html
.
escape
(
metric_label
)
}
— missing for this group"
"</div>
\n
"
)
main_fh
.
write
(
missing
)
sub_fh
.
write
(
missing
)
continue
if
conc_col
not
in
group_df
.
columns
:
conc_col
=
_find_concurrency_col
(
group_df
)
mn
=
metric_label
.
lower
().
strip
()
if
"tok/s"
in
mn
:
tput_group_df
=
group_df
elif
"ttft"
in
mn
:
ttft_group_df
=
group_df
elif
mn
in
(
"p99"
,
"median"
)
or
"tpot"
in
mn
:
tpot_group_df
=
group_df
display_group
=
group_df
.
drop
(
columns
=
group_cols_canonical
,
errors
=
"ignore"
)
display_group
=
group_df
.
drop
(
columns
=
group_cols_canonical
,
errors
=
"ignore"
)
html
=
render_metric_table_html
(
display_group
,
metric_label
,
suffix
,
args
)
main_fh
.
write
(
html
)
sub_fh
.
write
(
html
)
maybe_write_plot
(
main_fh
,
sub_fh
,
group_df
=
group_df
,
raw_data_cols
=
raw_data_cols
,
metric_label
=
metric_label
,
y_axis_col
=
y_axis_col
,
args
=
args
,
)
html
=
render_metric_table_html
(
display_group
,
metric_label
,
suffix
,
args
excel_blocks
.
append
(
(
metric_label
,
group_df
.
reset_index
(
drop
=
True
))
)
if
csv_dir
:
fn
=
_safe_filename
(
f
"
{
sheet
}
__
{
metric_label
}
"
.
replace
(
" "
,
"_"
).
replace
(
"/"
,
"_"
)
)
group_df
.
to_csv
(
csv_dir
/
f
"
{
fn
}
.csv"
,
index
=
False
)
summary_html
=
build_valid_max_concurrency_summary_html
(
tput_group_df
=
tput_group_df
,
ttft_group_df
=
ttft_group_df
,
tpot_group_df
=
tpot_group_df
,
conc_col
=
conc_col
,
args
=
args
,
)
main_fh
.
write
(
html
)
sub_fh
.
write
(
html
)
maybe_write_plot
(
main_fh
,
sub_fh
,
group_df
=
group_df
,
raw_data_cols
=
raw_data_cols
,
metric_label
=
metric_label
,
y_axis_col
=
y_axis_col
,
if
summary_html
:
main_fh
.
write
(
summary_html
)
sub_fh
.
write
(
summary_html
)
summary_df
=
build_valid_max_concurrency_summary_df
(
tput_group_df
=
tput_group_df
,
ttft_group_df
=
ttft_group_df
,
tpot_group_df
=
tpot_group_df
,
conc_col
=
conc_col
,
args
=
args
,
)
if
summary_df
is
not
None
:
excel_blocks
.
append
(
(
"Valid Max Concurrency Summary"
,
summary_df
)
)
if
csv_dir
:
fn
=
_safe_filename
(
f
"
{
sheet
}
__Valid_Max_Concurrency_Summary"
)
summary_df
.
to_csv
(
csv_dir
/
f
"
{
fn
}
.csv"
,
index
=
False
)
summary_html
=
build_valid_max_concurrency_summary_html
(
tput_group_df
=
tput_group_df
,
ttft_group_df
=
ttft_group_df
,
tpot_group_df
=
tpot_group_df
,
conc_col
=
conc_col
,
args
=
args
,
)
if
summary_html
:
main_fh
.
write
(
summary_html
)
sub_fh
.
write
(
summary_html
)
if
do_excel
:
_write_tables_to_excel_sheet
(
xw
,
sheet
,
excel_blocks
)
if
disable_excel
:
print
(
"Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1)."
)
else
:
print
(
f
"Wrote Excel:
{
excel_path
}
"
)
if
csv_dir
:
print
(
f
"Wrote CSVs under:
{
csv_dir
}
"
)
def
main
():
...
...
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
100755 → 100644
View file @
3fb4b5fa
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
...
...
@@ -9,14 +7,26 @@
set
-x
set
-o
pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN
=
"
${
DRY_RUN
:-
0
}
"
MODEL_FILTER
=
"
${
MODEL_FILTER
:-}
"
DTYPE_FILTER
=
"
${
DTYPE_FILTER
:-}
"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY
=
"
${
ENABLE_ADAPTIVE_CONCURRENCY
:-
0
}
"
SLA_TTFT_MS
=
"
${
SLA_TTFT_MS
:-
3000
}
"
SLA_TPOT_MS
=
"
${
SLA_TPOT_MS
:-
100
}
"
ADAPTIVE_MAX_PROBES
=
"
${
ADAPTIVE_MAX_PROBES
:-
8
}
"
ADAPTIVE_MAX_CONCURRENCY
=
"
${
ADAPTIVE_MAX_CONCURRENCY
:-
1024
}
"
check_gpus
()
{
if
command
-v
nvidia-smi
;
then
# check the number of GPUs and GPU type.
declare
-g
gpu_count
=
$(
nvidia-smi
--list-gpus
|
wc
-l
)
declare
-g
gpu_count
=
$(
nvidia-smi
--list-gpus
|
grep
-c
.
||
true
)
elif
command
-v
amd-smi
;
then
declare
-g
gpu_count
=
$(
amd-smi list |
grep
'GPU'
|
wc
-l
)
declare
-g
gpu_count
=
$(
amd-smi list |
grep
-c
'GPU'
|
|
true
)
elif
command
-v
hl-smi
;
then
declare
-g
gpu_count
=
$(
hl-smi
--list
|
grep
-i
"Module ID"
|
wc
-l
)
declare
-g
gpu_count
=
$(
hl-smi
--list
|
grep
-
c
i
"Module ID"
|
|
true
)
fi
if
[[
$gpu_count
-gt
0
]]
;
then
...
...
@@ -44,7 +54,7 @@ check_cpus() {
declare
-g
numa_count
=
$(
lscpu |
grep
"NUMA node(s):"
|
awk
'{print $3}'
)
if
[[
$numa_count
-gt
0
]]
;
then
echo
"NUMA found."
echo
$numa_count
echo
"
$numa_count
"
else
echo
"Need at least 1 NUMA to run benchmarking."
exit
1
...
...
@@ -112,13 +122,12 @@ json2envs() {
}
wait_for_server
()
{
# wait for vllm server to start
# return 1 if vllm server crashes
local
timeout_val
=
"1200"
timeout
"
$timeout_val
"
bash
-c
'
until curl -
X POST
localhost:8000/v1/
completions
; do
until curl -
sf http://
localhost:8000/v1/
models >/dev/null
; do
sleep 1
done'
&&
return
0
||
return
1
done
'
}
kill_processes_launched_by_current_bash
()
{
...
...
@@ -181,6 +190,304 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND
artifact upload
"
$RESULTS_FOLDER
/*"
}
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving
()
{
local
test_name
=
$1
local
qps
=
$2
local
max_concurrency
=
$3
echo
"
$RESULTS_FOLDER
/
${
test_name
}
_qps_
${
qps
}
_concurrency_
${
max_concurrency
}
.json"
}
#######################################
# Print the first non-null latency value found in a benchmark result JSON.
# Candidate keys are tried in the order listed below (p99 variants first,
# then mean variants), so several result layouts are accepted.
# Arguments: $1 - metric selector; "ttft" picks the TTFT key list, any other
#                 value picks the TPOT/ITL key list
#            $2 - path to the result JSON
# Outputs:   the value on stdout; nothing if the file is missing or no
#            candidate key is present
# Returns:   0 if the file does not exist, otherwise jq's exit status
#######################################
extract_metric_ms() {
  local metric_name=$1
  local json_file=$2
  local candidates

  # A missing result file is not an error here: print nothing and succeed.
  if [[ ! -f "$json_file" ]]; then
    return 0
  fi

  case "$metric_name" in
    ttft)
      candidates='
        .ttft_ms.p99?,
        .metrics.ttft_ms.p99?,
        .ttft.p99?,
        .metrics.ttft.p99?,
        .p99_ttft_ms?,
        .ttft_ms.mean?,
        .metrics.ttft_ms.mean?,
        .ttft.mean?,
        .metrics.ttft.mean?,
        .mean_ttft_ms?
      '
      ;;
    *)
      candidates='
        .tpot_ms.p99?,
        .metrics.tpot_ms.p99?,
        .tpot.p99?,
        .metrics.tpot.p99?,
        .p99_tpot_ms?,
        .itl_ms.p99?,
        .metrics.itl_ms.p99?,
        .inter_token_latency_ms.p99?,
        .tpot_ms.mean?,
        .metrics.tpot_ms.mean?,
        .tpot.mean?,
        .metrics.tpot.mean?,
        .itl_ms.mean?,
        .metrics.itl_ms.mean?,
        .mean_tpot_ms?,
        .mean_itl_ms?
      '
      ;;
  esac

  # Collect every candidate, drop nulls, and emit the first survivor (if any).
  jq -r "[ ${candidates} ] | map(select(. != null)) | .[0] // empty" "$json_file"
}
#######################################
# Check one benchmark result JSON against the TTFT/TPOT SLA limits.
# Globals:   SLA_TTFT_MS, SLA_TPOT_MS (read)
# Arguments: $1 - path to the result JSON
# Returns:   0 - both metrics are within their limits
#            1 - at least one metric exceeds its limit
#            2 - verdict unknown: file missing, a metric is absent or not a
#                number, or the jq comparison itself failed
#######################################
evaluate_sla_from_json() {
  local json_file=$1
  local ttft
  local tpot
  local pass
  # JSON-style number: optional sign, digits, optional fraction/exponent.
  local number_re='^-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$'

  [[ -f "$json_file" ]] || return 2

  ttft=$(extract_metric_ms ttft "$json_file")
  tpot=$(extract_metric_ms tpot "$json_file")
  [[ -n "$ttft" && -n "$tpot" ]] || return 2

  # A non-numeric metric must not be reported as an SLA violation (rc 1):
  # callers bisect on rc 0/1 and would converge on a bogus boundary.
  [[ "$ttft" =~ $number_re && "$tpot" =~ $number_re ]] || return 2

  # Compare in jq so fractional values are handled; a jq failure (e.g. a
  # malformed SLA_*_MS value) is "unknown", not "failed".
  pass=$(
    jq -n \
      --argjson ttft "$ttft" \
      --argjson tpot "$tpot" \
      --argjson sla_ttft "$SLA_TTFT_MS" \
      --argjson sla_tpot "$SLA_TPOT_MS" \
      '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)'
  ) || return 2

  [[ "$pass" == "true" ]]
}
#######################################
# Write the adaptive-concurrency SLA summary for one (test, qps) pair.
# Empty pass/fail arguments are stored as JSON null; non-empty ones are
# converted to numbers by the jq program.
# Globals:   SLA_TTFT_MS, SLA_TPOT_MS (read)
# Arguments: $1 - output file path
#            $2 - test name
#            $3 - qps (stored as a string)
#            $4 - static_last_pass
#            $5 - static_first_fail
#            $6 - final_last_pass
#            $7 - final_first_fail
# Outputs:   overwrites $1 with the JSON object
#######################################
write_adaptive_summary_json() {
  local summary_file=$1 test_name=$2 qps=$3
  local static_last_pass=$4 static_first_fail=$5
  local final_last_pass=$6 final_first_fail=$7

  # Collect the jq invocation in an array so each value stays one argument.
  local -a jq_args=(
    -n
    --arg test_name "$test_name"
    --arg qps "$qps"
    --argjson sla_ttft "$SLA_TTFT_MS"
    --argjson sla_tpot "$SLA_TPOT_MS"
    --arg static_last_pass "${static_last_pass:-}"
    --arg static_first_fail "${static_first_fail:-}"
    --arg final_last_pass "${final_last_pass:-}"
    --arg final_first_fail "${final_first_fail:-}"
  )

  jq "${jq_args[@]}" \
    '{
      test_name: $test_name,
      qps: $qps,
      sla_ttft_ms: $sla_ttft,
      sla_tpot_ms: $sla_tpot,
      static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
      static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
      final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
      final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
    }' > "$summary_file"
}
#######################################
# Run (or reuse) one `vllm bench serve` probe at a given concurrency and
# report whether it meets the SLA.
# If the result JSON for this (test, qps, concurrency) already exists, the
# benchmark is not re-run and the existing file is evaluated.
# Globals:   RESULTS_FOLDER, gpu_type, DRY_RUN (read)
#            PROMPTS_PER_CONCURRENCY, MIN_NUM_PROMPTS, MAX_NUM_PROMPTS (read)
# Arguments: $1 - test name
#            $2 - qps
#            $3 - max concurrency to probe
#            $4 - tensor parallel size (recorded as metadata)
#            $5 - compilation config mode (recorded as metadata)
#            $6 - optimization level (recorded as metadata)
#            $7 - client args (already stripped of --num-prompts if scaled)
#            $8 - extra client args for a remote server
#            $9 - server command (recorded in the .commands file)
# Outputs:   logs the client command; writes "<name>.commands" in
#            RESULTS_FOLDER
# Returns:   exit status of evaluate_sla_from_json on the probe's result JSON
#######################################
run_single_serving_probe() {
  local test_name=$1
  local qps=$2
  local max_concurrency=$3
  local tp=$4
  local compilation_config_mode=$5
  local optimization_level=$6
  local client_args_effective=$7
  local client_remote_args=$8
  local server_command=$9
  local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
  local result_json
  # Kept local so the probe does not clobber the caller's loop variables of
  # the same name.
  local num_prompts
  local num_prompts_arg=""
  local client_command
  local jq_output
  # Same fallbacks as run_serving_tests, so the clamps below stay meaningful
  # even if this helper is called before those globals are initialised.
  local min_num_prompts="${MIN_NUM_PROMPTS:-1}"
  local max_num_prompts="${MAX_NUM_PROMPTS:-1000000}"

  result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")

  # Reuse an existing result instead of re-running the benchmark.
  if [[ -f "$result_json" ]]; then
    evaluate_sla_from_json "$result_json"
    return $?
  fi

  # Optional scaling: num_prompts = concurrency * PROMPTS_PER_CONCURRENCY,
  # clamped to [min_num_prompts, max_num_prompts].
  if [[ -n "${PROMPTS_PER_CONCURRENCY:-}" ]]; then
    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
    if (( num_prompts < min_num_prompts )); then
      num_prompts=$min_num_prompts
    fi
    if (( num_prompts > max_num_prompts )); then
      num_prompts=$max_num_prompts
    fi
    num_prompts_arg="--num-prompts $num_prompts"
  fi

  client_command="vllm bench serve \
    --save-result \
    --result-dir $RESULTS_FOLDER \
    --result-filename ${new_test_name}.json \
    --request-rate $qps \
    --max-concurrency $max_concurrency \
    $num_prompts_arg \
    --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
    $client_args_effective $client_remote_args "

  echo "Adaptive probe: $client_command"

  # In dry-run mode the command is only logged, never executed.
  if [[ "${DRY_RUN:-0}" != "1" ]]; then
    bash -c "$client_command"
  fi

  # Record the exact commands next to the result for later inspection.
  jq_output=$(jq -n \
    --arg server "$server_command" \
    --arg client "$client_command" \
    --arg gpu "$gpu_type" \
    '{
      server_command: $server,
      client_command: $client,
      gpu_type: $gpu,
      adaptive_search: true
    }')
  echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

  evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results
()
{
local
test_name
=
$1
local
qps
=
$2
local
max_concurrency_list_raw
=
$3
local
tp
=
$4
local
compilation_config_mode
=
$5
local
optimization_level
=
$6
local
client_args_effective
=
$7
local
client_remote_args
=
$8
local
server_command
=
$9
local
sorted_points
local
point
local
rc
local
static_last_pass
=
""
local
static_first_fail
=
""
local
largest_static
=
""
local
step_hint
=
1
local
previous_point
=
""
local
low
local
high
local
mid
local
probes
=
0
local
summary_file
=
"
$RESULTS_FOLDER
/
${
test_name
}
_qps_
${
qps
}
_sla_summary.json"
[[
"
${
ENABLE_ADAPTIVE_CONCURRENCY
}
"
==
"1"
]]
||
return
0
[[
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
||
return
0
sorted_points
=
$(
for
point
in
$max_concurrency_list_raw
;
do
printf
'%s\n'
"
$point
"
;
done
|
tr
-d
"'"
|
awk
'/^[0-9]+$/'
|
sort
-n
|
uniq
)
[[
-n
"
$sorted_points
"
]]
||
return
0
while
read
-r
point
;
do
[[
-z
"
$point
"
]]
&&
continue
largest_static
=
"
$point
"
evaluate_sla_from_json
"
$(
result_json_path_for_serving
"
$test_name
"
"
$qps
"
"
$point
"
)
"
rc
=
$?
if
((
rc
==
0
))
;
then
static_last_pass
=
"
$point
"
elif
((
rc
==
1
))
;
then
if
[[
-n
"
$static_last_pass
"
]]
;
then
static_first_fail
=
"
$point
"
break
fi
fi
if
[[
-n
"
$previous_point
"
]]
;
then
step_hint
=
$((
point
-
previous_point
))
if
((
step_hint < 1
))
;
then
step_hint
=
1
;
fi
fi
previous_point
=
"
$point
"
done
<<<
"
$sorted_points
"
if
[[
-z
"
$static_last_pass
"
]]
;
then
write_adaptive_summary_json
"
$summary_file
"
"
$test_name
"
"
$qps
"
""
"
$static_first_fail
"
""
"
$static_first_fail
"
return
0
fi
if
[[
-n
"
$static_first_fail
"
]]
;
then
low
=
$static_last_pass
high
=
$static_first_fail
while
((
low + 1 < high
))
&&
((
probes < ADAPTIVE_MAX_PROBES
))
;
do
mid
=
$((
(
low
+
high
)
/
2
))
probes
=
$((
probes
+
1
))
run_single_serving_probe
\
"
$test_name
"
"
$qps
"
"
$mid
"
"
$tp
"
\
"
$compilation_config_mode
"
"
$optimization_level
"
\
"
$client_args_effective
"
"
$client_remote_args
"
"
$server_command
"
rc
=
$?
if
((
rc
==
0
))
;
then
low
=
$mid
elif
((
rc
==
1
))
;
then
high
=
$mid
else
break
fi
done
write_adaptive_summary_json
"
$summary_file
"
"
$test_name
"
"
$qps
"
"
$static_last_pass
"
"
$static_first_fail
"
"
$low
"
"
$high
"
return
0
fi
low
=
$largest_static
high
=
""
while
((
probes < ADAPTIVE_MAX_PROBES
))
;
do
point
=
$((
low
+
step_hint
))
if
((
point
>
ADAPTIVE_MAX_CONCURRENCY
))
;
then
point
=
$ADAPTIVE_MAX_CONCURRENCY
fi
((
point
>
low
))
||
break
probes
=
$((
probes
+
1
))
run_single_serving_probe
\
"
$test_name
"
"
$qps
"
"
$point
"
"
$tp
"
\
"
$compilation_config_mode
"
"
$optimization_level
"
\
"
$client_args_effective
"
"
$client_remote_args
"
"
$server_command
"
rc
=
$?
if
((
rc
==
0
))
;
then
low
=
$point
((
point
==
ADAPTIVE_MAX_CONCURRENCY
))
&&
break
step_hint
=
$((
step_hint
*
2
))
if
((
step_hint < 1
))
;
then
step_hint
=
1
;
fi
elif
((
rc
==
1
))
;
then
high
=
$point
break
else
break
fi
done
if
[[
-n
"
$high
"
]]
;
then
while
((
low + 1 < high
))
&&
((
probes < ADAPTIVE_MAX_PROBES
))
;
do
mid
=
$((
(
low
+
high
)
/
2
))
probes
=
$((
probes
+
1
))
run_single_serving_probe
\
"
$test_name
"
"
$qps
"
"
$mid
"
"
$tp
"
\
"
$compilation_config_mode
"
"
$optimization_level
"
\
"
$client_args_effective
"
"
$client_remote_args
"
"
$server_command
"
rc
=
$?
if
((
rc
==
0
))
;
then
low
=
$mid
elif
((
rc
==
1
))
;
then
high
=
$mid
else
break
fi
done
fi
write_adaptive_summary_json
"
$summary_file
"
"
$test_name
"
"
$qps
"
"
$static_last_pass
"
""
"
$low
"
"
$high
"
}
run_benchmark_tests
()
{
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
...
...
@@ -252,37 +559,16 @@ run_benchmark_tests() {
done
}
run_latency_tests
()
{
run_benchmark_tests
"latency"
"
$1
"
}
run_startup_tests
()
{
run_benchmark_tests
"startup"
"
$1
"
}
run_throughput_tests
()
{
run_benchmark_tests
"throughput"
"
$1
"
}
run_serving_tests
()
{
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
run_latency_tests
()
{
run_benchmark_tests
"latency"
"
$1
"
;
}
run_startup_tests
()
{
run_benchmark_tests
"startup"
"
$1
"
;
}
run_throughput_tests
()
{
run_benchmark_tests
"throughput"
"
$1
"
;
}
local
serving_test_file
serving_test_file
=
$1
# Iterate over serving tests
jq
-c
'
merge_serving_tests_stream
()
{
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local
serving_test_file
=
"
$1
"
# shellcheck disable=SC2016
local
merged
=
'
if type == "array" then
# Plain format: test cases array
.[]
...
...
@@ -304,7 +590,50 @@ run_serving_tests() {
else
error("Unsupported serving test file format: must be array or object with .tests")
end
'
"
$serving_test_file
"
|
while
read
-r
params
;
do
'
jq
-c
"
$merged
"
"
$serving_test_file
"
|
\
if
[[
"
${
DRY_RUN
:-
0
}
"
==
"1"
&&
(
"
${
MODEL_FILTER
}${
DTYPE_FILTER
}
"
!=
""
)
]]
;
then
jq
-c
--arg
model
"
$MODEL_FILTER
"
--arg
dtype
"
$DTYPE_FILTER
"
'
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests
()
{
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local
serving_test_file
serving_test_file
=
$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if
[[
"
${
DRY_RUN
:-
0
}
"
==
"1"
&&
(
"
${
MODEL_FILTER
}${
DTYPE_FILTER
}
"
!=
""
)
]]
;
then
local
count
count
=
$(
merge_serving_tests_stream
"
$serving_test_file
"
|
wc
-l
|
tr
-d
' '
)
if
[[
"
$count
"
-eq
0
]]
;
then
echo
"No matching serving tests found in
$serving_test_file
for model='
$MODEL_FILTER
' dtype='
$DTYPE_FILTER
'."
>
&2
return
0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream
"
$serving_test_file
"
|
while
read
-r
params
;
do
# get the test name, and append the GPU type back to it.
test_name
=
$(
echo
"
$params
"
| jq
-r
'.test_name'
)
if
[[
!
"
$test_name
"
=
~ ^serving_
]]
;
then
...
...
@@ -323,10 +652,48 @@ run_serving_tests() {
server_envs
=
$(
echo
"
$params
"
| jq
-r
'.server_environment_variables'
)
client_params
=
$(
echo
"
$params
"
| jq
-r
'.client_parameters'
)
server_args
=
$(
json2args
"
$server_params
"
)
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model
=
$(
echo
"
$server_params
"
| jq
-r
'.model // empty'
)
if
[[
-z
"
$server_model
"
||
"
$server_model
"
==
"null"
]]
;
then
echo
"Error: serving test '
$test_name
' is missing server_parameters.model"
>
&2
exit
1
fi
server_params_no_model
=
$(
echo
"
$server_params
"
| jq
-c
'del(.model)'
)
server_args
=
$(
json2args
"
$server_params_no_model
"
)
server_envs
=
$(
json2envs
"
$server_envs
"
)
client_args
=
$(
json2args
"
$client_params
"
)
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY
=
"
${
PROMPTS_PER_CONCURRENCY
-
}
"
# no default on purpose
MIN_NUM_PROMPTS
=
"
${
MIN_NUM_PROMPTS
:-
1
}
"
MAX_NUM_PROMPTS
=
"
${
MAX_NUM_PROMPTS
:-
1000000
}
"
if
[[
-n
"
${
PROMPTS_PER_CONCURRENCY
}
"
]]
;
then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np
=
"
$(
printf
' %s '
"
$client_args
"
\
|
sed
-E
\
-e
's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g'
\
-e
's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)
"
# normalize whitespace
client_args_no_np
=
"
$(
echo
"
$client_args_no_np
"
|
tr
-s
' '
|
sed
-E
's/^ //; s/ $//'
)
"
client_args_no_np
=
"
$(
echo
"
$client_args_no_np
"
| xargs
)
"
client_args_effective
=
"
$client_args_no_np
"
else
client_args_effective
=
"
$client_args
"
fi
# qps_list
qps_list
=
$(
echo
"
$params
"
| jq
-r
'.qps_list'
)
qps_list
=
$(
echo
"
$qps_list
"
| jq
-r
'.[] | @sh'
)
...
...
@@ -358,14 +725,13 @@ run_serving_tests() {
fi
# check if server model and client model is aligned
server_model
=
$(
echo
"
$server_params
"
| jq
-r
'.model'
)
client_model
=
$(
echo
"
$client_params
"
| jq
-r
'.model'
)
if
[[
$server_model
!=
"
$client_model
"
]]
;
then
echo
"Server model and client model must be the same. Skip testcase
$test_name
."
continue
fi
server_command
=
"
$server_envs
vllm serve
\
server_command
=
"
$server_envs
vllm serve
$server_model
\
$server_args
"
# run the server
...
...
@@ -373,7 +739,7 @@ run_serving_tests() {
echo
"Server command:
$server_command
"
# support remote vllm server
client_remote_args
=
""
if
[[
-z
"
${
REMOTE_HOST
}
"
]]
;
then
if
[[
-z
"
${
REMOTE_HOST
}
"
&&
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
;
then
bash
-c
"
$server_command
"
&
server_pid
=
$!
# wait until the server is alive
...
...
@@ -384,6 +750,9 @@ run_serving_tests() {
echo
""
echo
"vLLM failed to start within the timeout period."
fi
elif
[[
"
${
DRY_RUN
:-
0
}
"
==
"1"
]]
;
then
# dry-run: don't start server
echo
"Dry Run."
else
server_command
=
"Using Remote Server
$REMOTE_HOST
$REMOTE_PORT
"
if
[[
${
REMOTE_PORT
}
]]
;
then
...
...
@@ -402,15 +771,21 @@ run_serving_tests() {
for
qps
in
$qps_list
;
do
# remove the surrounding single quote from qps
if
[[
"
$qps
"
==
*
"inf"
*
]]
;
then
echo
"qps was
$qps
"
qps
=
"inf"
echo
"now qps is
$qps
"
fi
# iterate over different max_concurrency
for
max_concurrency
in
$max_concurrency_list
;
do
new_test_name
=
$
test_name
"
_qps_
"
$qps
"
_concurrency_
"
$max_concurrency
new_test_name
=
"
${
test_name
}
_qps_
$
{
qps
}
_concurrency_
$
{
max_concurrency
}
"
echo
" new test name
$new_test_name
"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg
=
""
if
[[
-n
"
${
PROMPTS_PER_CONCURRENCY
}
"
]]
;
then
num_prompts
=
$((
max_concurrency
*
PROMPTS_PER_CONCURRENCY
))
if
((
num_prompts < MIN_NUM_PROMPTS
))
;
then
num_prompts
=
$MIN_NUM_PROMPTS
;
fi
if
((
num_prompts
>
MAX_NUM_PROMPTS
))
;
then
num_prompts
=
$MAX_NUM_PROMPTS
;
fi
num_prompts_arg
=
"--num-prompts
$num_prompts
"
fi
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
client_command
=
"vllm bench serve
\
...
...
@@ -419,13 +794,16 @@ run_serving_tests() {
--result-filename
${
new_test_name
}
.json
\
--request-rate
$qps
\
--max-concurrency
$max_concurrency
\
$num_prompts_arg
\
--metadata tensor_parallel_size=
$tp
compilation_config.mode=
$compilation_config_mode
optimization_level=
$optimization_level
\
$client_args
$client_remote_args
"
$client_args
_effective
$client_remote_args
"
echo
"Running test case
$test_name
with qps
$qps
"
echo
"Client command:
$client_command
"
bash
-c
"
$client_command
"
if
[[
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
;
then
bash
-c
"
$client_command
"
fi
# record the benchmarking commands
jq_output
=
$(
jq
-n
\
...
...
@@ -440,15 +818,23 @@ run_serving_tests() {
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
${
new_test_name
}
.commands"
done
adaptive_refine_from_static_results
\
"
$test_name
"
"
$qps
"
"
$max_concurrency_list
"
"
$tp
"
\
"
$compilation_config_mode
"
"
$optimization_level
"
\
"
$client_args_effective
"
"
$client_remote_args
"
"
$server_command
"
done
# clean up
kill
-9
$server_pid
kill_gpu_processes
if
[[
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
;
then
kill
-9
"
$server_pid
"
kill_gpu_processes
fi
done
}
main
()
{
local
ARCH
ARCH
=
''
if
[[
"
$ON_CPU
"
==
"1"
]]
;
then
...
...
@@ -458,7 +844,13 @@ main() {
check_gpus
ARCH
=
"
$arch_suffix
"
fi
check_hf_token
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if
[[
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
;
then
check_hf_token
else
echo
"DRY_RUN=1 -> skip HF_TOKEN validation"
fi
# dependencies
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
...
...
@@ -479,11 +871,16 @@ main() {
# dump vllm info via vllm collect-env
env_output
=
$(
vllm collect-env
)
echo
"
$env_output
"
>
"
$RESULTS_FOLDER
/vllm_env.txt"
# benchmarking
run_serving_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
SERVING_JSON
:-
serving
-tests
$ARCH
.json
}
"
run_serving_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
SERVING_JSON
:-
serving
-tests
$ARCH
.json
}
"
||
exit
$?
if
[[
"
${
DRY_RUN
:-
0
}
"
==
"1"
]]
;
then
echo
"DRY_RUN=1 -> skip latency/startup/throughput suites"
exit
0
fi
run_latency_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
LATENCY_JSON
:-
latency
-tests
$ARCH
.json
}
"
run_startup_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
STARTUP_JSON
:-
startup
-tests
$ARCH
.json
}
"
run_throughput_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
THROUGHPUT_JSON
:-
throughput
-tests
$ARCH
.json
}
"
...
...
@@ -491,6 +888,7 @@ main() {
# postprocess benchmarking results
pip
install
tabulate pandas
python3
$QUICK_BENCHMARK_ROOT
/scripts/convert-results-json-to-markdown.py
python3
$QUICK_BENCHMARK_ROOT
/scripts/compare-json-results.py
-f
$RESULTS_FOLDER
/benchmark_results.json
upload_to_buildkite
}
...
...
.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
View file @
3fb4b5fa
...
...
@@ -51,5 +51,56 @@
"max-model-len"
:
256
,
"async-scheduling"
:
""
}
},
{
"test_name"
:
"latency_deepseek_r1"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"deepseek-ai/DeepSeek-R1"
,
"tensor_parallel_size"
:
8
,
"load_format"
:
"dummy"
,
"max-model-len"
:
2048
,
"dtype"
:
"bfloat16"
}
},
{
"test_name"
:
"latency_llama4_maverick_17b128e_instruct_fp8"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
,
"tensor_parallel_size"
:
8
,
"max-model-len"
:
512
,
"max-num-seqs"
:
128
,
"async-scheduling"
:
""
,
"gpu-memory-utilization"
:
0.95
,
"enable_expert_parallel"
:
""
}
},
{
"test_name"
:
"latency_qwen3_8b"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"tensor_parallel_size"
:
1
,
"max-model-len"
:
2048
,
"max-num-seqs"
:
128
,
"dtype"
:
"bfloat16"
,
"async-scheduling"
:
""
}
}
]
.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
0 → 100644
View file @
3fb4b5fa
{
"defaults"
:
{
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
},
"server_parameters"
:
{
"dtype"
:
"bfloat16"
,
"model"
:
"openai/whisper-large-v3-turbo"
},
"client_parameters"
:
{
"model"
:
"openai/whisper-large-v3-turbo"
,
"backend"
:
"openai-audio"
,
"endpoint"
:
"/v1/audio/transcriptions"
,
"dataset_name"
:
"hf"
,
"dataset_path"
:
"openslr/librispeech_asr"
,
"hf_subset"
:
"clean"
,
"hf_split"
:
"test"
,
"no_stream"
:
""
,
"no_oversample"
:
""
,
"num_prompts"
:
200
}
},
"tests"
:
[
{
"test_name"
:
"serving_whisper_large_v3_turbo_librispeech_clean_tp1"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{}
}
]
}
.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
0 → 100644
View file @
3fb4b5fa
{
"defaults"
:
{
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
32
,
64
,
128
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"dtype"
:
"bfloat16"
,
"model"
:
"jinaai/jina-embeddings-v3"
,
"trust_remote_code"
:
""
},
"client_parameters"
:
{
"model"
:
"jinaai/jina-embeddings-v3"
,
"backend"
:
"openai-embeddings"
,
"endpoint"
:
"/v1/embeddings"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
"tests"
:
[
{
"test_name"
:
"serving_jina_embed_v3_tp1_sharegpt"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{}
}
]
}
.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
0 → 100644
View file @
3fb4b5fa
{
"defaults"
:
{
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-3.1-8B-Instruct"
,
"tensor_parallel_size"
:
1
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"disable_log_stats"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-3.1-8B-Instruct"
,
"backend"
:
"vllm"
,
"ignore-eos"
:
""
,
"num_prompts"
:
200
}
},
"tests"
:
[
{
"test_name"
:
"serving_llama8B_tp1_sharegpt"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name"
:
"serving_llama8B_tp2_sharegpt"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_int4_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp2_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp4_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int8_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int8_tp2_random_128_128"
,
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int8_tp4_random_128_128"
,
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama3B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_granite2B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen1.7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen4B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen8B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_glm9B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_gemma7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
}
]
}
.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
View file @
3fb4b5fa
...
...
@@ -72,17 +72,6 @@
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_128_2048"
,
"server_parameters"
:
{
...
...
@@ -105,17 +94,6 @@
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_2048_128"
,
"server_parameters"
:
{
...
...
@@ -139,144 +117,25 @@
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp1_random_128_128"
,
"test_name"
:
"serving_llama8B_tp1_random_2048_2048"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
1
28
,
"random-output-len"
:
1
28
"random-input-len"
:
2
04
8
,
"random-output-len"
:
2
04
8
}
},
{
"test_name"
:
"serving_llama8B_
int4_
tp2_random_
128_12
8"
,
"test_name"
:
"serving_llama8B_tp2_random_
2048_204
8"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp4_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama3B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_granite2B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen1.7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen4B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen8B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_glm9B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_gemma7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
}
]
...
...
Prev
1
2
3
4
5
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment