Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
488
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1630 additions
and
374 deletions
+1630
-374
.buildkite/hardware_tests/amd.yaml
.buildkite/hardware_tests/amd.yaml
+3
-2
.buildkite/hardware_tests/cpu.yaml
.buildkite/hardware_tests/cpu.yaml
+14
-0
.buildkite/image_build/image_build.sh
.buildkite/image_build/image_build.sh
+11
-12
.buildkite/image_build/image_build.yaml
.buildkite/image_build/image_build.yaml
+1
-2
.buildkite/image_build/image_build_cpu.sh
.buildkite/image_build/image_build_cpu.sh
+6
-8
.buildkite/image_build/image_build_cpu_arm64.sh
.buildkite/image_build/image_build_cpu_arm64.sh
+5
-5
.buildkite/image_build/image_build_hpu.sh
.buildkite/image_build/image_build_hpu.sh
+5
-5
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
.../lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+2
-2
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+1
-1
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+1
-1
.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
...kite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+2
-5
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+8
-2
.buildkite/performance-benchmarks/README.md
.buildkite/performance-benchmarks/README.md
+0
-1
.buildkite/performance-benchmarks/scripts/compare-json-results.py
...te/performance-benchmarks/scripts/compare-json-results.py
+628
-126
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
...formance-benchmarks/scripts/run-performance-benchmarks.sh
+453
-55
.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
...dkite/performance-benchmarks/tests/latency-tests-hpu.json
+51
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
...e/performance-benchmarks/tests/serving-tests-cpu-asr.json
+37
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
...performance-benchmarks/tests/serving-tests-cpu-embed.json
+41
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
.../performance-benchmarks/tests/serving-tests-cpu-text.json
+355
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
...dkite/performance-benchmarks/tests/serving-tests-cpu.json
+6
-147
No files found.
Too many changes to show.
To preserve performance only
488 of 488+
files are displayed.
Plain diff
Email patch
.buildkite/hardware_tests/amd.yaml
View file @
3fb4b5fa
group
:
Hardware
group
:
Hardware
- AMD Build
steps
:
steps
:
-
label
:
"
AMD:
:docker:
build
image"
-
label
:
"
AMD:
:docker:
build
image"
key
:
image-build-amd
depends_on
:
[]
depends_on
:
[]
device
:
amd_cpu
device
:
amd_cpu
no_plugin
:
true
no_plugin
:
true
...
@@ -9,7 +10,7 @@ steps:
...
@@ -9,7 +10,7 @@ steps:
docker build
docker build
--build-arg max_jobs=16
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942
;gfx950
'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
-f docker/Dockerfile.rocm
...
...
.buildkite/hardware_tests/cpu.yaml
View file @
3fb4b5fa
...
@@ -21,6 +21,20 @@ steps:
...
@@ -21,6 +21,20 @@ steps:
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
pytest -x -v -s tests/kernels/test_onednn.py"
-
label
:
CPU-Compatibility Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
-
cmake/cpu_extension.cmake
-
setup.py
-
vllm/platforms/cpu.py
commands
:
-
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
-
label
:
CPU-Language Generation and Pooling Model Tests
-
label
:
CPU-Language Generation and Pooling Model Tests
depends_on
:
[]
depends_on
:
[]
soft_fail
:
true
soft_fail
:
true
...
...
.buildkite/image_build/image_build.sh
View file @
3fb4b5fa
...
@@ -8,7 +8,7 @@ clean_docker_tag() {
...
@@ -8,7 +8,7 @@ clean_docker_tag() {
}
}
print_usage_and_exit
()
{
print_usage_and_exit
()
{
echo
"Usage:
$0
<registry> <repo> <commit> <branch> <
vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>
"
echo
"Usage:
$0
<registry> <repo> <commit> <branch> <
image_tag> [<image_tag_latest>]
"
exit
1
exit
1
}
}
...
@@ -142,11 +142,16 @@ resolve_parent_commit() {
...
@@ -142,11 +142,16 @@ resolve_parent_commit() {
print_bake_config
()
{
print_bake_config
()
{
echo
"--- :page_facing_up: Resolved bake configuration"
echo
"--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE
=
"bake-config-build-
${
BUILDKITE_BUILD_NUMBER
:-
local
}
.json"
# Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local
bake_tmp
bake_tmp
=
"
$(
mktemp
-d
)
"
BAKE_CONFIG_FILE
=
"
${
bake_tmp
}
/bake-config-build-
${
BUILDKITE_BUILD_NUMBER
:-
local
}
.json"
docker buildx bake
-f
"
${
VLLM_BAKE_FILE_PATH
}
"
-f
"
${
CI_HCL_PATH
}
"
--print
"
${
TARGET
}
"
|
tee
"
${
BAKE_CONFIG_FILE
}
"
||
true
docker buildx bake
-f
"
${
VLLM_BAKE_FILE_PATH
}
"
-f
"
${
CI_HCL_PATH
}
"
--print
"
${
TARGET
}
"
|
tee
"
${
BAKE_CONFIG_FILE
}
"
||
true
echo
"Saved bake config to
${
BAKE_CONFIG_FILE
}
"
echo
"Saved bake config to
${
BAKE_CONFIG_FILE
}
"
echo
"--- :arrow_down: Uploading bake config to Buildkite"
echo
"--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload
"
${
BAKE_CONFIG_FILE
}
"
(
cd
"
$(
dirname
"
${
BAKE_CONFIG_FILE
}
"
)
"
&&
buildkite-agent artifact upload
"
$(
basename
"
${
BAKE_CONFIG_FILE
}
"
)
"
)
}
}
#################################
#################################
...
@@ -154,7 +159,7 @@ print_bake_config() {
...
@@ -154,7 +159,7 @@ print_bake_config() {
#################################
#################################
print_instance_info
print_instance_info
if
[[
$#
-lt
7
]]
;
then
if
[[
$#
-lt
5
]]
;
then
print_usage_and_exit
print_usage_and_exit
fi
fi
...
@@ -163,10 +168,8 @@ REGISTRY=$1
...
@@ -163,10 +168,8 @@ REGISTRY=$1
REPO
=
$2
REPO
=
$2
BUILDKITE_COMMIT
=
$3
BUILDKITE_COMMIT
=
$3
BRANCH
=
$4
BRANCH
=
$4
VLLM_USE_PRECOMPILED
=
$5
IMAGE_TAG
=
$5
VLLM_MERGE_BASE_COMMIT
=
$6
IMAGE_TAG_LATEST
=
${
6
:-}
# only used for main branch, optional
IMAGE_TAG
=
$7
IMAGE_TAG_LATEST
=
${
8
:-}
# only used for main branch, optional
# build config
# build config
TARGET
=
"test-ci"
TARGET
=
"test-ci"
...
@@ -193,8 +196,6 @@ export CACHE_FROM
...
@@ -193,8 +196,6 @@ export CACHE_FROM
export
CACHE_FROM_BASE_BRANCH
export
CACHE_FROM_BASE_BRANCH
export
CACHE_FROM_MAIN
export
CACHE_FROM_MAIN
export
CACHE_TO
export
CACHE_TO
export
VLLM_USE_PRECOMPILED
export
VLLM_MERGE_BASE_COMMIT
# print args
# print args
echo
"--- :mag: Arguments"
echo
"--- :mag: Arguments"
...
@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
...
@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
echo
"REPO:
${
REPO
}
"
echo
"REPO:
${
REPO
}
"
echo
"BUILDKITE_COMMIT:
${
BUILDKITE_COMMIT
}
"
echo
"BUILDKITE_COMMIT:
${
BUILDKITE_COMMIT
}
"
echo
"BRANCH:
${
BRANCH
}
"
echo
"BRANCH:
${
BRANCH
}
"
echo
"VLLM_USE_PRECOMPILED:
${
VLLM_USE_PRECOMPILED
}
"
echo
"VLLM_MERGE_BASE_COMMIT:
${
VLLM_MERGE_BASE_COMMIT
}
"
echo
"IMAGE_TAG:
${
IMAGE_TAG
}
"
echo
"IMAGE_TAG:
${
IMAGE_TAG
}
"
echo
"IMAGE_TAG_LATEST:
${
IMAGE_TAG_LATEST
}
"
echo
"IMAGE_TAG_LATEST:
${
IMAGE_TAG_LATEST
}
"
...
...
.buildkite/image_build/image_build.yaml
View file @
3fb4b5fa
...
@@ -5,8 +5,7 @@ steps:
...
@@ -5,8 +5,7 @@ steps:
depends_on
:
[]
depends_on
:
[]
timeout_in_minutes
:
600
timeout_in_minutes
:
600
commands
:
commands
:
-
if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
-
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
-
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
retry
:
retry
:
automatic
:
automatic
:
-
exit_status
:
-1
# Agent was lost
-
exit_status
:
-1
# Agent was lost
...
...
.buildkite/image_build/image_build_cpu.sh
View file @
3fb4b5fa
...
@@ -11,10 +11,10 @@ REPO=$2
...
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT
=
$3
BUILDKITE_COMMIT
=
$3
# authenticate with AWS ECR
# authenticate with AWS ECR
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
# skip build if image already exists
# skip build if image already exists
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
)
]]
;
then
if
[[
-z
$(
docker manifest inspect
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-cpu
)
]]
;
then
echo
"Image not found, proceeding with build..."
echo
"Image not found, proceeding with build..."
else
else
echo
"Image found"
echo
"Image found"
...
@@ -24,13 +24,11 @@ fi
...
@@ -24,13 +24,11 @@ fi
# build
# build
docker build
--file
docker/Dockerfile.cpu
\
docker build
--file
docker/Dockerfile.cpu
\
--build-arg
max_jobs
=
16
\
--build-arg
max_jobs
=
16
\
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--build-arg
buildkite_commit
=
"
$BUILDKITE_COMMIT
"
\
--build-arg
VLLM_CPU_AVX512BF16
=
true
\
--build-arg
VLLM_CPU_X86
=
true
\
--build-arg
VLLM_CPU_AVX512VNNI
=
true
\
--tag
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-cpu
\
--build-arg
VLLM_CPU_AMXBF16
=
true
\
--tag
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
\
--target
vllm-test
\
--target
vllm-test
\
--progress
plain
.
--progress
plain
.
# push
# push
docker push
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
docker push
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-cpu
.buildkite/image_build/image_build_cpu_arm64.sh
View file @
3fb4b5fa
...
@@ -11,10 +11,10 @@ REPO=$2
...
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT
=
$3
BUILDKITE_COMMIT
=
$3
# authenticate with AWS ECR
# authenticate with AWS ECR
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
# skip build if image already exists
# skip build if image already exists
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
)
]]
;
then
if
[[
-z
$(
docker manifest inspect
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-arm64
-cpu
)
]]
;
then
echo
"Image not found, proceeding with build..."
echo
"Image not found, proceeding with build..."
else
else
echo
"Image found"
echo
"Image found"
...
@@ -24,10 +24,10 @@ fi
...
@@ -24,10 +24,10 @@ fi
# build
# build
docker build
--file
docker/Dockerfile.cpu
\
docker build
--file
docker/Dockerfile.cpu
\
--build-arg
max_jobs
=
16
\
--build-arg
max_jobs
=
16
\
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--build-arg
buildkite_commit
=
"
$BUILDKITE_COMMIT
"
\
--tag
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
\
--tag
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-arm64
-cpu
\
--target
vllm-test
\
--target
vllm-test
\
--progress
plain
.
--progress
plain
.
# push
# push
docker push
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
docker push
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-arm64
-cpu
.buildkite/image_build/image_build_hpu.sh
View file @
3fb4b5fa
...
@@ -11,10 +11,10 @@ REPO=$2
...
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT
=
$3
BUILDKITE_COMMIT
=
$3
# authenticate with AWS ECR
# authenticate with AWS ECR
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
# skip build if image already exists
# skip build if image already exists
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-hpu
)
]]
;
then
if
[[
-z
$(
docker manifest inspect
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-hpu
)
]]
;
then
echo
"Image not found, proceeding with build..."
echo
"Image not found, proceeding with build..."
else
else
echo
"Image found"
echo
"Image found"
...
@@ -25,10 +25,10 @@ fi
...
@@ -25,10 +25,10 @@ fi
docker build
\
docker build
\
--file
tests/pytorch_ci_hud_benchmark/Dockerfile.hpu
\
--file
tests/pytorch_ci_hud_benchmark/Dockerfile.hpu
\
--build-arg
max_jobs
=
16
\
--build-arg
max_jobs
=
16
\
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--build-arg
buildkite_commit
=
"
$BUILDKITE_COMMIT
"
\
--tag
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-hpu
\
--tag
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-hpu
\
--progress
plain
\
--progress
plain
\
https://github.com/vllm-project/vllm-gaudi.git
https://github.com/vllm-project/vllm-gaudi.git
# push
# push
docker push
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-hpu
docker push
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-hpu
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
View file @
3fb4b5fa
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
#
# Make sure you have lm-eval-harness installed:
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
usage
()
{
echo
``
echo
``
...
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
...
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks
chartqa
\
--tasks
chartqa
\
--batch_size
auto
\
--batch_size
auto
\
--apply_chat_template
\
--apply_chat_template
\
--limit
$LIMIT
--limit
"
$LIMIT
"
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
View file @
3fb4b5fa
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
# We can use this script to compute baseline accuracy on GSM for transformers.
#
#
# Make sure you have lm-eval-harness installed:
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
usage
()
{
echo
``
echo
``
...
...
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
View file @
3fb4b5fa
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
# We use this for fp8, which HF does not support.
#
#
# Make sure you have lm-eval-harness installed:
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
usage
()
{
echo
``
echo
``
...
...
.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
View file @
3fb4b5fa
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
# We use this for fp8, which HF does not support.
#
#
# Make sure you have lm-eval-harness installed:
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
usage
()
{
echo
``
echo
``
...
@@ -20,14 +20,11 @@ usage() {
...
@@ -20,14 +20,11 @@ usage() {
echo
echo
}
}
while
getopts
"m:
b:
l:f:t:"
OPT
;
do
while
getopts
"m:l:f:t:"
OPT
;
do
case
${
OPT
}
in
case
${
OPT
}
in
m
)
m
)
MODEL
=
"
$OPTARG
"
MODEL
=
"
$OPTARG
"
;;
;;
b
)
BATCH_SIZE
=
"
$OPTARG
"
;;
l
)
l
)
LIMIT
=
"
$OPTARG
"
LIMIT
=
"
$OPTARG
"
;;
;;
...
...
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
View file @
3fb4b5fa
...
@@ -13,9 +13,10 @@ import os
...
@@ -13,9 +13,10 @@ import os
from
contextlib
import
contextmanager
from
contextlib
import
contextmanager
import
lm_eval
import
lm_eval
import
numpy
as
np
import
yaml
import
yaml
from
vllm.platforms
import
current_platform
DEFAULT_RTOL
=
0.08
DEFAULT_RTOL
=
0.08
...
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
...
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True,"
"allow_deprecated_quantization=True,"
)
)
if
current_platform
.
is_rocm
()
and
"Nemotron-3"
in
eval_config
[
"model_name"
]:
model_args
+=
"attention_backend=TRITON_ATTN"
env_vars
=
eval_config
.
get
(
"env_vars"
,
None
)
env_vars
=
eval_config
.
get
(
"env_vars"
,
None
)
with
scoped_env_vars
(
env_vars
):
with
scoped_env_vars
(
env_vars
):
results
=
lm_eval
.
simple_evaluate
(
results
=
lm_eval
.
simple_evaluate
(
...
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
...
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f
"ground_truth=
{
ground_truth
:.
3
f
}
| "
f
"ground_truth=
{
ground_truth
:.
3
f
}
| "
f
"measured=
{
measured_value
:.
3
f
}
| rtol=
{
rtol
}
"
f
"measured=
{
measured_value
:.
3
f
}
| rtol=
{
rtol
}
"
)
)
success
=
success
and
np
.
isclose
(
ground_truth
,
measured_value
,
rtol
=
rtol
)
min_acceptable
=
ground_truth
*
(
1
-
rtol
)
success
=
success
and
measured_value
>=
min_acceptable
assert
success
assert
success
.buildkite/performance-benchmarks/README.md
View file @
3fb4b5fa
...
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
...
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters"
:
{
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"tensor_parallel_size"
:
1
,
"tensor_parallel_size"
:
1
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
"load_format"
:
"dummy"
},
},
...
...
.buildkite/performance-benchmarks/scripts/compare-json-results.py
View file @
3fb4b5fa
...
@@ -7,8 +7,10 @@ import argparse
...
@@ -7,8 +7,10 @@ import argparse
import
html
as
_html
import
html
as
_html
import
json
import
json
import
os
import
os
from
contextlib
import
nullcontext
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
importlib
import
util
from
importlib
import
util
from
pathlib
import
Path
import
pandas
as
pd
import
pandas
as
pd
...
@@ -31,6 +33,45 @@ pd.set_option("display.precision", 2)
...
@@ -31,6 +33,45 @@ pd.set_option("display.precision", 2)
pd
.
set_option
(
"display.float_format"
,
lambda
x
:
f
"
{
x
:.
2
f
}
"
)
pd
.
set_option
(
"display.float_format"
,
lambda
x
:
f
"
{
x
:.
2
f
}
"
)
# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def
_find_concurrency_col
(
df
:
pd
.
DataFrame
)
->
str
:
for
c
in
[
"# of max concurrency."
,
"# of max concurrency"
,
"Max Concurrency"
,
"max_concurrency"
,
"Concurrency"
,
]:
if
c
in
df
.
columns
:
return
c
for
c
in
df
.
columns
:
if
"concurr"
in
str
(
c
).
lower
():
s
=
df
[
c
]
if
s
.
dtype
.
kind
in
"iu"
and
s
.
nunique
()
>
1
and
s
.
min
()
>=
1
:
return
c
raise
ValueError
(
"Cannot infer concurrency column. "
"Please rename the column to one of the known names "
"or add an explicit override (e.g., --concurrency-col)."
)
def
_normalize_concurrency_in_df
(
df
:
pd
.
DataFrame
,
canonical
:
str
=
"# of max concurrency."
)
->
pd
.
DataFrame
:
if
canonical
in
df
.
columns
:
return
df
detected
=
_find_concurrency_col
(
df
)
if
detected
in
df
.
columns
and
detected
!=
canonical
:
return
df
.
rename
(
columns
=
{
detected
:
canonical
})
df
[
canonical
]
=
pd
.
NA
return
df
# -----------------------------
# -----------------------------
# Core data compare
# Core data compare
# -----------------------------
# -----------------------------
...
@@ -50,19 +91,25 @@ def compare_data_columns(
...
@@ -50,19 +91,25 @@ def compare_data_columns(
- Concat along axis=1 (indexes align), then reset_index so callers can
- Concat along axis=1 (indexes align), then reset_index so callers can
group by columns.
group by columns.
- If --debug, add a <file_label>_name column per file.
- If --debug, add a <file_label>_name column per file.
Minimal fix to support different max_concurrency lists across files:
- normalize concurrency column naming to "# of max concurrency."
- align on UNION of keys (missing points become NaN)
- BUGFIX: don't drop throughput rows based on P99/Median presence
"""
"""
print
(
"
\n
compare_data_column:"
,
data_column
)
print
(
"
\n
compare_data_column:"
,
data_column
)
frames
=
[]
frames
=
[]
raw_data_cols
:
list
[
str
]
=
[]
raw_data_cols
:
list
[
str
]
=
[]
compare_frames
=
[]
# Determine key cols after normalizing concurrency
cols_per_file
:
list
[
set
]
=
[]
cols_per_file
:
list
[
set
]
=
[]
for
f
in
files
:
for
f
in
files
:
try
:
try
:
df_tmp
=
pd
.
read_json
(
f
,
orient
=
"records"
)
df_tmp
=
pd
.
read_json
(
f
,
orient
=
"records"
)
except
Exception
as
err
:
except
Exception
as
err
:
raise
ValueError
(
f
"Failed to read
{
f
}
"
)
from
err
raise
ValueError
(
f
"Failed to read
{
f
}
"
)
from
err
df_tmp
=
_normalize_concurrency_in_df
(
df_tmp
,
canonical
=
"# of max concurrency."
)
cols_per_file
.
append
(
set
(
df_tmp
.
columns
))
cols_per_file
.
append
(
set
(
df_tmp
.
columns
))
key_cols
=
[
c
for
c
in
info_cols
if
all
(
c
in
cset
for
cset
in
cols_per_file
)]
key_cols
=
[
c
for
c
in
info_cols
if
all
(
c
in
cset
for
cset
in
cols_per_file
)]
...
@@ -73,12 +120,25 @@ def compare_data_columns(
...
@@ -73,12 +120,25 @@ def compare_data_columns(
"No common key columns found from info_cols across the input files."
"No common key columns found from info_cols across the input files."
)
)
meta_added
=
False
union_index
=
None
metas
:
list
[
pd
.
DataFrame
]
=
[]
staged
:
list
[
tuple
[
str
,
pd
.
Series
,
pd
.
Series
|
None
]]
=
[]
for
file
in
files
:
for
file
in
files
:
df
=
pd
.
read_json
(
file
,
orient
=
"records"
)
df
=
pd
.
read_json
(
file
,
orient
=
"records"
)
df
=
_normalize_concurrency_in_df
(
df
,
canonical
=
"# of max concurrency."
)
if
drop_column
in
df
.
columns
:
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have
# NaN in P99/Median columns even if the column exists in the JSON.
metric_lc
=
str
(
data_column
).
lower
()
is_latency_metric
=
(
"ttft"
in
metric_lc
or
"tpot"
in
metric_lc
or
"p99"
in
metric_lc
or
"median"
in
metric_lc
or
metric_lc
.
strip
()
in
{
"p99"
,
"median"
}
)
if
is_latency_metric
and
drop_column
in
df
.
columns
:
df
=
df
.
dropna
(
subset
=
[
drop_column
],
ignore_index
=
True
)
df
=
df
.
dropna
(
subset
=
[
drop_column
],
ignore_index
=
True
)
for
c
in
(
for
c
in
(
...
@@ -103,35 +163,61 @@ def compare_data_columns(
...
@@ -103,35 +163,61 @@ def compare_data_columns(
meta
=
meta
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
first
()
meta
=
meta
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
first
()
file_label
=
"/"
.
join
(
file
.
split
(
"/"
)[:
-
1
])
or
os
.
path
.
basename
(
file
)
file_label
=
"/"
.
join
(
file
.
split
(
"/"
)[:
-
1
])
or
os
.
path
.
basename
(
file
)
s
=
df_idx
[
data_column
]
if
not
s
.
index
.
is_unique
:
s
=
s
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
mean
()
s
.
name
=
file_label
if
not
meta_added
:
if
data_column
in
df_idx
.
columns
:
frames
.
append
(
meta
)
s
=
df_idx
[
data_column
]
meta_added
=
True
if
not
s
.
index
.
is_unique
:
s
=
s
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
mean
()
else
:
# keep NA series to preserve meta keys for union_index
s
=
pd
.
Series
(
pd
.
NA
,
index
=
meta
.
index
)
s
.
name
=
file_label
name_s
=
None
if
debug
and
name_column
in
df_idx
.
columns
:
if
debug
and
name_column
in
df_idx
.
columns
:
name_s
=
df_idx
[
name_column
]
name_s
=
df_idx
[
name_column
]
if
not
name_s
.
index
.
is_unique
:
if
not
name_s
.
index
.
is_unique
:
name_s
=
name_s
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
first
()
name_s
=
name_s
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
first
()
name_s
.
name
=
f
"
{
file_label
}
_name"
name_s
.
name
=
f
"
{
file_label
}
_name"
frames
.
append
(
name_s
)
frames
.
append
(
s
)
if
union_index
is
None
:
union_index
=
meta
.
index
else
:
union_index
=
union_index
.
union
(
meta
.
index
)
metas
.
append
(
meta
)
staged
.
append
((
file_label
,
s
,
name_s
))
if
union_index
is
None
:
raise
ValueError
(
"No data found after loading inputs."
)
# meta first (union-aligned): build UNION meta across all files
if
metas
:
meta_union
=
pd
.
concat
(
metas
,
axis
=
0
)
# Collapse duplicates on the MultiIndex; keep first non-null per column
meta_union
=
meta_union
.
groupby
(
level
=
key_cols
,
dropna
=
False
).
first
()
frames
.
append
(
meta_union
.
reindex
(
union_index
))
# values + ratios (union-aligned)
metric_series_aligned
:
list
[
pd
.
Series
]
=
[]
for
file_label
,
s
,
name_s
in
staged
:
s_aligned
=
s
.
reindex
(
union_index
)
frames
.
append
(
s_aligned
)
raw_data_cols
.
append
(
file_label
)
raw_data_cols
.
append
(
file_label
)
compare_frames
.
append
(
s
)
metric_series_aligned
.
append
(
s_aligned
)
if
debug
and
name_s
is
not
None
:
frames
.
append
(
name_s
.
reindex
(
union_index
))
if
len
(
compare_frames
)
>=
2
:
if
len
(
metric_series_aligned
)
>=
2
:
base
=
compare_frames
[
0
]
base
=
metric_series_aligned
[
0
]
current
=
compare_frames
[
-
1
]
current
=
metric_series_aligned
[
-
1
]
if
"P99"
in
data_column
or
"Median"
in
data_column
:
if
"P99"
in
str
(
data_column
)
or
"Median"
in
str
(
data_column
)
:
ratio
=
base
/
current
ratio
=
base
/
current
else
:
else
:
ratio
=
current
/
base
ratio
=
current
/
base
ratio
=
ratio
.
mask
(
base
==
0
)
ratio
=
ratio
.
mask
(
base
==
0
)
ratio
.
name
=
f
"Ratio 1 vs
{
len
(
compare_frames
)
}
"
ratio
.
name
=
f
"Ratio 1 vs
{
len
(
metric_series_aligned
)
}
"
frames
.
append
(
ratio
)
frames
.
append
(
ratio
)
concat_df
=
pd
.
concat
(
frames
,
axis
=
1
).
reset_index
(
drop
=
True
)
concat_df
=
pd
.
concat
(
frames
,
axis
=
1
).
reset_index
(
drop
=
True
)
...
@@ -202,24 +288,10 @@ def split_json_by_tp_pp(
...
@@ -202,24 +288,10 @@ def split_json_by_tp_pp(
# -----------------------------
# -----------------------------
# Styling helpers
# Styling helpers
# -----------------------------
# -----------------------------
def
_find_concurrency_col
(
df
:
pd
.
DataFrame
)
->
str
:
for
c
in
[
"# of max concurrency."
,
"# of max concurrency"
,
"Max Concurrency"
,
"max_concurrency"
,
"Concurrency"
,
]:
if
c
in
df
.
columns
:
return
c
for
c
in
df
.
columns
:
if
df
[
c
].
dtype
.
kind
in
"iu"
and
df
[
c
].
nunique
()
>
1
and
df
[
c
].
min
()
>=
1
:
return
c
return
"# of max concurrency."
def
_highlight_threshold
(
def
_highlight_threshold
(
df
:
pd
.
DataFrame
,
threshold
:
float
df
:
pd
.
DataFrame
,
threshold
:
float
,
slack_pct
:
float
=
0.0
,
)
->
pd
.
io
.
formats
.
style
.
Styler
:
)
->
pd
.
io
.
formats
.
style
.
Styler
:
conc_col
=
_find_concurrency_col
(
df
)
conc_col
=
_find_concurrency_col
(
df
)
key_cols
=
[
key_cols
=
[
...
@@ -232,12 +304,24 @@ def _highlight_threshold(
...
@@ -232,12 +304,24 @@ def _highlight_threshold(
]
]
conf_cols
=
[
c
for
c
in
conf_cols
if
pd
.
api
.
types
.
is_numeric_dtype
(
df
[
c
])]
conf_cols
=
[
c
for
c
in
conf_cols
if
pd
.
api
.
types
.
is_numeric_dtype
(
df
[
c
])]
return
df
.
style
.
map
(
try
:
lambda
v
:
"background-color:#e6ffe6;font-weight:bold;"
slack_pct
=
float
(
slack_pct
or
0.0
)
if
pd
.
notna
(
v
)
and
v
<=
threshold
except
Exception
:
else
""
,
slack_pct
=
0.0
subset
=
conf_cols
,
slack_limit
=
threshold
*
(
1.0
+
slack_pct
/
100.0
)
)
def
_cell
(
v
):
if
pd
.
isna
(
v
):
return
""
if
v
<=
threshold
:
# Strict SLA
return
"background-color:#e6ffe6;font-weight:bold;"
if
v
<=
slack_limit
:
# Within slack range
return
"background-color:#ffe5cc;font-weight:bold;"
return
""
return
df
.
style
.
map
(
_cell
,
subset
=
conf_cols
)
def
highlight_ratio_columns
(
styler
:
pd
.
io
.
formats
.
style
.
Styler
):
def
highlight_ratio_columns
(
styler
:
pd
.
io
.
formats
.
style
.
Styler
):
...
@@ -275,6 +359,177 @@ def _apply_two_decimals(
...
@@ -275,6 +359,177 @@ def _apply_two_decimals(
return
styler
.
format
({
c
:
"{:.2f}"
for
c
in
num_cols
},
na_rep
=
""
)
return
styler
.
format
({
c
:
"{:.2f}"
for
c
in
num_cols
},
na_rep
=
""
)
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def
_sanitize_sheet_name
(
name
:
str
)
->
str
:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
module's compile overhead/edge-cases on some systems.
"""
name
=
"sheet"
if
name
is
None
else
str
(
name
)
# Replace illegal characters with underscore.
trans
=
str
.
maketrans
(
{
":"
:
"_"
,
"
\\
"
:
"_"
,
"/"
:
"_"
,
"?"
:
"_"
,
"*"
:
"_"
,
"["
:
"_"
,
"]"
:
"_"
,
}
)
name
=
name
.
translate
(
trans
)
# Strip quotes/spaces and collapse whitespace.
name
=
name
.
strip
().
strip
(
"'"
)
name
=
" "
.
join
(
name
.
split
())
if
not
name
:
name
=
"sheet"
return
name
[:
31
]
def
_group_to_sheet_base
(
group_cols
:
list
[
str
],
gkey_tuple
)
->
str
:
d
=
dict
(
zip
(
group_cols
,
gkey_tuple
))
# Always keep input/output lengths (these are important).
ilen
=
d
.
get
(
"Input Len"
,
""
)
olen
=
d
.
get
(
"Output Len"
,
""
)
lens
=
f
"_
{
ilen
}
x
{
olen
}
"
if
ilen
!=
""
and
olen
!=
""
else
""
# Shorten model name aggressively to make room for lens.
model
=
d
.
get
(
"Model"
,
"model"
)
leaf
=
str
(
model
).
split
(
"/"
)[
-
1
]
max_model_len
=
max
(
1
,
31
-
len
(
lens
))
model_short
=
leaf
[:
max_model_len
]
return
_sanitize_sheet_name
(
f
"
{
model_short
}{
lens
}
"
)
def
_write_tables_to_excel_sheet
(
writer
:
pd
.
ExcelWriter
,
sheet
:
str
,
blocks
:
list
[
tuple
[
str
,
pd
.
DataFrame
]]
):
"""Write all blocks to a sheet with a single to_excel() call.
Pandas+openpyxl can be extremely slow when called many times per sheet.
We flatten blocks into one table with a 'Section' column to keep structure
while making Excel generation fast and deterministic.
"""
if
not
blocks
:
pd
.
DataFrame
().
to_excel
(
writer
,
sheet_name
=
sheet
,
index
=
False
)
return
combined_parts
:
list
[
pd
.
DataFrame
]
=
[]
for
title
,
df
in
blocks
:
df2
=
df
.
copy
()
# Put the section label as the first column for readability.
df2
.
insert
(
0
,
"Section"
,
title
)
combined_parts
.
append
(
df2
)
combined
=
pd
.
concat
(
combined_parts
,
axis
=
0
,
ignore_index
=
True
,
sort
=
False
)
combined
.
to_excel
(
writer
,
sheet_name
=
sheet
,
index
=
False
)
def
_safe_filename
(
s
:
str
)
->
str
:
# Fast path without the third-party `regex` module.
s
=
" "
.
join
(
str
(
s
).
strip
().
split
())
allowed
=
[]
for
ch
in
s
:
if
ch
.
isalnum
()
or
ch
in
"._-"
:
allowed
.
append
(
ch
)
else
:
allowed
.
append
(
"_"
)
out
=
""
.
join
(
allowed
)
return
out
[:
180
]
if
len
(
out
)
>
180
else
out
# -----------------------------
# vLLM environment export helper
# -----------------------------
def
_parse_vllm_env_txt
(
env_path
:
Path
)
->
pd
.
DataFrame
:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines
=
env_path
.
read_text
(
encoding
=
"utf-8"
,
errors
=
"replace"
).
splitlines
()
section
=
"General"
rows
:
list
[
dict
]
=
[]
def
set_section
(
s
:
str
):
nonlocal
section
s
=
(
s
or
""
).
strip
()
if
s
:
section
=
s
for
raw
in
lines
:
stripped
=
raw
.
strip
()
if
not
stripped
:
continue
# divider lines like =====
if
set
(
stripped
)
<=
{
"="
}:
continue
# section header heuristic: short standalone line
if
":"
not
in
stripped
and
"="
not
in
stripped
and
len
(
stripped
)
<=
64
:
if
stripped
.
lower
().
startswith
(
"collecting environment information"
):
continue
set_section
(
stripped
)
continue
# env var style: KEY=VALUE (and not a URL with :)
if
"="
in
stripped
and
":"
not
in
stripped
:
k
,
v
=
stripped
.
split
(
"="
,
1
)
k
=
k
.
strip
()
v
=
v
.
strip
()
if
k
:
rows
.
append
({
"Section"
:
section
,
"Key"
:
k
,
"Value"
:
v
})
continue
# key: value
if
":"
in
stripped
:
k
,
v
=
stripped
.
split
(
":"
,
1
)
k
=
k
.
strip
()
v
=
v
.
strip
()
if
k
:
rows
.
append
({
"Section"
:
section
,
"Key"
:
k
,
"Value"
:
v
})
continue
return
pd
.
DataFrame
(
rows
,
columns
=
[
"Section"
,
"Key"
,
"Value"
])
def
_load_env_df_for_inputs
(
args
,
files
:
list
[
str
])
->
pd
.
DataFrame
|
None
:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir
:
Path
|
None
=
None
if
getattr
(
args
,
"file"
,
None
):
base_dir
=
Path
(
args
.
file
[
0
]).
resolve
().
parent
elif
files
:
base_dir
=
Path
(
files
[
0
]).
resolve
().
parent
if
base_dir
is
None
:
return
None
env_path
=
base_dir
/
"vllm_env.txt"
if
not
env_path
.
exists
():
return
None
df
=
_parse_vllm_env_txt
(
env_path
)
return
df
# -----------------------------
# -----------------------------
# Valid max concurrency summary helpers
# Valid max concurrency summary helpers
# -----------------------------
# -----------------------------
...
@@ -301,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
...
@@ -301,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def
_max_concurrency_ok
(
def
_max_concurrency_ok
(
df
:
pd
.
DataFrame
,
conc_col
:
str
,
cfg_col
:
str
,
threshold
:
float
df
:
pd
.
DataFrame
,
conc_col
:
str
,
cfg_col
:
str
,
threshold
:
float
,
slack_pct
:
float
=
0.0
,
):
):
if
df
is
None
or
conc_col
not
in
df
.
columns
or
cfg_col
not
in
df
.
columns
:
if
df
is
None
or
conc_col
not
in
df
.
columns
or
cfg_col
not
in
df
.
columns
:
return
pd
.
NA
return
pd
.
NA
...
@@ -314,7 +573,14 @@ def _max_concurrency_ok(
...
@@ -314,7 +573,14 @@ def _max_concurrency_ok(
if
d
.
empty
:
if
d
.
empty
:
return
pd
.
NA
return
pd
.
NA
ok
=
d
[
d
[
cfg_col
]
<=
threshold
]
# Accept values up to (1 + slack_pct%) above the SLA.
try
:
slack_pct
=
float
(
slack_pct
or
0.0
)
except
Exception
:
slack_pct
=
0.0
effective_limit
=
float
(
threshold
)
*
(
1.0
+
slack_pct
/
100.0
)
ok
=
d
[
d
[
cfg_col
]
<=
effective_limit
]
if
ok
.
empty
:
if
ok
.
empty
:
return
pd
.
NA
return
pd
.
NA
...
@@ -380,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
...
@@ -380,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
if
not
cfg_cols
:
if
not
cfg_cols
:
cfg_cols
=
sorted
(
set
(
ttft_cols
)
|
set
(
tpot_cols
)
|
set
(
tput_cols
),
key
=
str
)
cfg_cols
=
sorted
(
set
(
ttft_cols
)
|
set
(
tpot_cols
)
|
set
(
tput_cols
),
key
=
str
)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi
=
args
.
ttft_max_ms
*
(
1.0
+
args
.
ttft_slack_pct
/
100.0
)
tpot_hi
=
args
.
tpot_max_ms
*
(
1.0
+
args
.
tpot_slack_pct
/
100.0
)
ttft_range
=
f
"
{
args
.
ttft_max_ms
:
g
}
–
{
ttft_hi
:
g
}
ms (+
{
args
.
ttft_slack_pct
:
g
}
%)"
tpot_range
=
f
"
{
args
.
tpot_max_ms
:
g
}
–
{
tpot_hi
:
g
}
ms (+
{
args
.
tpot_slack_pct
:
g
}
%)"
rows
=
[]
rows
=
[]
for
cfg
in
cfg_cols
:
for
cfg
in
cfg_cols
:
ttft_max
=
(
ttft_max
=
(
_max_concurrency_ok
(
ttft_group_df
,
conc_col
,
cfg
,
args
.
ttft_max_ms
)
_max_concurrency_ok
(
ttft_group_df
,
conc_col
,
cfg
,
args
.
ttft_max_ms
,
args
.
ttft_slack_pct
)
if
ttft_group_df
is
not
None
if
ttft_group_df
is
not
None
else
pd
.
NA
else
pd
.
NA
)
)
tpot_max
=
(
tpot_max
=
(
_max_concurrency_ok
(
tpot_group_df
,
conc_col
,
cfg
,
args
.
tpot_max_ms
)
_max_concurrency_ok
(
tpot_group_df
,
conc_col
,
cfg
,
args
.
tpot_max_ms
,
args
.
tpot_slack_pct
)
if
tpot_group_df
is
not
None
if
tpot_group_df
is
not
None
else
pd
.
NA
else
pd
.
NA
)
)
...
@@ -417,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
...
@@ -417,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
rows
.
append
(
rows
.
append
(
{
{
"Configuration"
:
cfg
,
"Configuration"
:
cfg
,
f
"Max
{
conc_col
}
(TTFT ≤
{
args
.
ttft_max_ms
:
g
}
ms
)"
:
ttft_max
,
f
"Max
{
conc_col
}
(TTFT ≤
{
ttft_range
}
)"
:
ttft_max
,
f
"Max
{
conc_col
}
(TPOT ≤
{
args
.
tpot_max_ms
:
g
}
ms
)"
:
tpot_max
,
f
"Max
{
conc_col
}
(TPOT ≤
{
tpot_range
}
)"
:
tpot_max
,
f
"Max
{
conc_col
}
(Both)"
:
both
,
f
"Max
{
conc_col
}
(Both)"
:
both
,
"Output Tput @ Both (tok/s)"
:
tput_at_both
,
"Output Tput @ Both (tok/s)"
:
tput_at_both
,
"TTFT @ Both (ms)"
:
ttft_at_both
,
"TTFT @ Both (ms)"
:
ttft_at_both
,
...
@@ -428,7 +704,6 @@ def build_valid_max_concurrency_summary_html(
...
@@ -428,7 +704,6 @@ def build_valid_max_concurrency_summary_html(
summary_df
=
pd
.
DataFrame
(
rows
)
summary_df
=
pd
.
DataFrame
(
rows
)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for
c
in
summary_df
.
columns
:
for
c
in
summary_df
.
columns
:
if
c
==
"Configuration"
:
if
c
==
"Configuration"
:
continue
continue
...
@@ -436,12 +711,10 @@ def build_valid_max_concurrency_summary_html(
...
@@ -436,12 +711,10 @@ def build_valid_max_concurrency_summary_html(
both_col
=
f
"Max
{
conc_col
}
(Both)"
both_col
=
f
"Max
{
conc_col
}
(Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters
=
{}
formatters
=
{}
for
c
in
summary_df
.
columns
:
for
c
in
summary_df
.
columns
:
if
c
==
"Configuration"
:
if
c
==
"Configuration"
:
continue
continue
# default argument binds per-column formatter correctly
formatters
[
c
]
=
lambda
v
:
""
if
pd
.
isna
(
v
)
else
f
"
{
float
(
v
):.
2
f
}
"
formatters
[
c
]
=
lambda
v
:
""
if
pd
.
isna
(
v
)
else
f
"
{
float
(
v
):.
2
f
}
"
styler
=
summary_df
.
style
.
format
(
formatters
)
styler
=
summary_df
.
style
.
format
(
formatters
)
...
@@ -460,6 +733,104 @@ def build_valid_max_concurrency_summary_html(
...
@@ -460,6 +733,104 @@ def build_valid_max_concurrency_summary_html(
return
title
+
styler
.
to_html
(
table_attributes
=
'border="1" class="dataframe"'
)
return
title
+
styler
.
to_html
(
table_attributes
=
'border="1" class="dataframe"'
)
def
build_valid_max_concurrency_summary_df
(
tput_group_df
:
pd
.
DataFrame
|
None
,
ttft_group_df
:
pd
.
DataFrame
|
None
,
tpot_group_df
:
pd
.
DataFrame
|
None
,
conc_col
:
str
,
args
,
)
->
pd
.
DataFrame
|
None
:
if
ttft_group_df
is
None
and
tpot_group_df
is
None
:
return
None
ttft_cols
=
(
_config_value_columns
(
ttft_group_df
,
conc_col
)
if
ttft_group_df
is
not
None
else
[]
)
tpot_cols
=
(
_config_value_columns
(
tpot_group_df
,
conc_col
)
if
tpot_group_df
is
not
None
else
[]
)
tput_cols
=
(
_config_value_columns
(
tput_group_df
,
conc_col
)
if
tput_group_df
is
not
None
else
[]
)
if
ttft_group_df
is
not
None
and
tpot_group_df
is
not
None
:
cfg_cols
=
[
c
for
c
in
ttft_cols
if
c
in
tpot_cols
]
if
tput_group_df
is
not
None
:
cfg_cols
=
[
c
for
c
in
cfg_cols
if
c
in
tput_cols
]
or
cfg_cols
else
:
cfg_cols
=
ttft_cols
or
tpot_cols
if
not
cfg_cols
:
cfg_cols
=
sorted
(
set
(
ttft_cols
)
|
set
(
tpot_cols
)
|
set
(
tput_cols
),
key
=
str
)
ttft_hi
=
args
.
ttft_max_ms
*
(
1.0
+
args
.
ttft_slack_pct
/
100.0
)
tpot_hi
=
args
.
tpot_max_ms
*
(
1.0
+
args
.
tpot_slack_pct
/
100.0
)
ttft_range
=
f
"
{
args
.
ttft_max_ms
:
g
}
–
{
ttft_hi
:
g
}
ms (+
{
args
.
ttft_slack_pct
:
g
}
%)"
tpot_range
=
f
"
{
args
.
tpot_max_ms
:
g
}
–
{
tpot_hi
:
g
}
ms (+
{
args
.
tpot_slack_pct
:
g
}
%)"
rows
=
[]
for
cfg
in
cfg_cols
:
ttft_max
=
(
_max_concurrency_ok
(
ttft_group_df
,
conc_col
,
cfg
,
args
.
ttft_max_ms
,
args
.
ttft_slack_pct
)
if
ttft_group_df
is
not
None
else
pd
.
NA
)
tpot_max
=
(
_max_concurrency_ok
(
tpot_group_df
,
conc_col
,
cfg
,
args
.
tpot_max_ms
,
args
.
tpot_slack_pct
)
if
tpot_group_df
is
not
None
else
pd
.
NA
)
both
=
(
pd
.
NA
if
(
pd
.
isna
(
ttft_max
)
or
pd
.
isna
(
tpot_max
))
else
min
(
ttft_max
,
tpot_max
)
)
tput_at_both
=
(
_value_at_concurrency
(
tput_group_df
,
conc_col
,
cfg
,
both
)
if
tput_group_df
is
not
None
else
pd
.
NA
)
ttft_at_both
=
(
_value_at_concurrency
(
ttft_group_df
,
conc_col
,
cfg
,
both
)
if
ttft_group_df
is
not
None
else
pd
.
NA
)
tpot_at_both
=
(
_value_at_concurrency
(
tpot_group_df
,
conc_col
,
cfg
,
both
)
if
tpot_group_df
is
not
None
else
pd
.
NA
)
rows
.
append
(
{
"Configuration"
:
cfg
,
f
"Max
{
conc_col
}
(TTFT ≤
{
ttft_range
}
)"
:
ttft_max
,
f
"Max
{
conc_col
}
(TPOT ≤
{
tpot_range
}
)"
:
tpot_max
,
f
"Max
{
conc_col
}
(Both)"
:
both
,
"Output Tput @ Both (tok/s)"
:
tput_at_both
,
"TTFT @ Both (ms)"
:
ttft_at_both
,
"TPOT @ Both (ms)"
:
tpot_at_both
,
}
)
df
=
pd
.
DataFrame
(
rows
)
for
c
in
df
.
columns
:
if
c
!=
"Configuration"
:
df
[
c
]
=
pd
.
to_numeric
(
df
[
c
],
errors
=
"coerce"
)
return
df
# -----------------------------
# -----------------------------
# Plot helper
# Plot helper
# -----------------------------
# -----------------------------
...
@@ -537,6 +908,35 @@ def build_parser() -> argparse.ArgumentParser:
...
@@ -537,6 +908,35 @@ def build_parser() -> argparse.ArgumentParser:
default
=
100.0
,
default
=
100.0
,
help
=
"Reference limit for TPOT plots (ms)"
,
help
=
"Reference limit for TPOT plots (ms)"
,
)
)
# ---- SLA tolerance (slack) options ----
parser
.
add_argument
(
"--ttft-slack-pct"
,
type
=
float
,
default
=
5.0
,
help
=
"Allowed percentage above TTFT SLA (default: 5)."
,
)
parser
.
add_argument
(
"--tpot-slack-pct"
,
type
=
float
,
default
=
5.0
,
help
=
"Allowed percentage above TPOT SLA (default: 5)."
,
)
# ---- export options ----
parser
.
add_argument
(
"--excel-out"
,
type
=
str
,
default
=
"perf_comparison.xlsx"
,
help
=
"Write one sheet per (Model, Dataset, Input Len, Output Len)."
,
)
parser
.
add_argument
(
"--csv-out-dir"
,
type
=
str
,
default
=
""
,
help
=
"If set, write per-group per-metric CSVs into this directory."
,
)
return
parser
return
parser
...
@@ -615,9 +1015,13 @@ def render_metric_table_html(
...
@@ -615,9 +1015,13 @@ def render_metric_table_html(
metric_name
=
metric_label
.
lower
()
metric_name
=
metric_label
.
lower
()
if
"ttft"
in
metric_name
:
if
"ttft"
in
metric_name
:
styler
=
_highlight_threshold
(
display_group
,
args
.
ttft_max_ms
)
styler
=
_highlight_threshold
(
display_group
,
args
.
ttft_max_ms
,
args
.
ttft_slack_pct
)
elif
(
"tpot"
in
metric_name
)
or
(
"median"
in
metric_name
)
or
(
"p99"
in
metric_name
):
elif
(
"tpot"
in
metric_name
)
or
(
"median"
in
metric_name
)
or
(
"p99"
in
metric_name
):
styler
=
_highlight_threshold
(
display_group
,
args
.
tpot_max_ms
)
styler
=
_highlight_threshold
(
display_group
,
args
.
tpot_max_ms
,
args
.
tpot_slack_pct
)
else
:
else
:
styler
=
display_group
.
style
styler
=
display_group
.
style
...
@@ -657,7 +1061,6 @@ def maybe_write_plot(
...
@@ -657,7 +1061,6 @@ def maybe_write_plot(
markers
=
True
,
markers
=
True
,
)
)
# Ensure plot hover + y tick labels are also 2 decimals.
fig
.
update_traces
(
hovertemplate
=
"%{y:.2f}<extra></extra>"
)
fig
.
update_traces
(
hovertemplate
=
"%{y:.2f}<extra></extra>"
)
fig
.
update_yaxes
(
tickformat
=
".2f"
)
fig
.
update_yaxes
(
tickformat
=
".2f"
)
...
@@ -730,87 +1133,186 @@ def write_report_group_first(
...
@@ -730,87 +1133,186 @@ def write_report_group_first(
for
metric_label
,
(
df
,
_
)
in
metric_cache
.
items
()
for
metric_label
,
(
df
,
_
)
in
metric_cache
.
items
()
}
}
with
open
(
"perf_comparison.html"
,
"w"
,
encoding
=
"utf-8"
)
as
main_fh
:
csv_dir
=
Path
(
args
.
csv_out_dir
)
if
args
.
csv_out_dir
else
None
main_fh
.
write
(
'<meta charset="utf-8">
\n
'
)
if
csv_dir
:
for
gkey
in
group_keys
:
csv_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
gkey_tuple
=
normalize_group_key
(
gkey
)
suffix
=
build_group_suffix
(
group_cols_canonical
,
gkey_tuple
)
sub_path
=
group_filename
(
gkey_tuple
)
group_header
=
(
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f
"
{
_html
.
escape
(
suffix
)
}
"
"</div>
\n
"
)
main_fh
.
write
(
group_header
)
with
open
(
sub_path
,
"w"
,
encoding
=
"utf-8"
)
as
sub_fh
:
sub_fh
.
write
(
'<meta charset="utf-8">
\n
'
)
sub_fh
.
write
(
group_header
)
tput_group_df
=
None
ttft_group_df
=
None
tpot_group_df
=
None
conc_col
=
args
.
xaxis
for
metric_label
in
plan
.
data_cols
:
gb
=
metric_groupbys
[
metric_label
]
df_sorted
,
raw_data_cols
=
metric_cache
[
metric_label
]
try
:
group_df
=
gb
.
get_group
(
gkey
)
except
KeyError
:
missing
=
(
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f
"
{
_html
.
escape
(
metric_label
)
}
— missing for this group"
"</div>
\n
"
)
main_fh
.
write
(
missing
)
excel_path
=
args
.
excel_out
or
"perf_comparison.xlsx"
sub_fh
.
write
(
missing
)
disable_excel
=
os
.
getenv
(
"VLLM_COMPARE_DISABLE_EXCEL"
,
"0"
)
==
"1"
continue
if
conc_col
not
in
group_df
.
columns
:
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
conc_col
=
_find_concurrency_col
(
group_df
)
excel_engine
=
(
os
.
getenv
(
"VLLM_COMPARE_EXCEL_ENGINE"
,
"xlsxwriter"
).
strip
()
or
"xlsxwriter"
)
if
excel_engine
==
"xlsxwriter"
and
util
.
find_spec
(
"xlsxwriter"
)
is
None
:
excel_engine
=
"openpyxl"
excel_engine_kwargs
=
{}
if
excel_engine
==
"xlsxwriter"
:
# Reduce memory pressure & usually faster writes.
excel_engine_kwargs
=
{
"options"
:
{
"constant_memory"
:
True
}}
xw_ctx
=
(
nullcontext
(
None
)
if
disable_excel
else
pd
.
ExcelWriter
(
excel_path
,
engine
=
excel_engine
,
engine_kwargs
=
excel_engine_kwargs
)
)
with
xw_ctx
as
xw
:
used_sheets
:
set
[
str
]
=
set
()
# ---- Environment sheet (first) ----
env_sheet
=
_sanitize_sheet_name
(
"Environment"
)
env_df
=
_load_env_df_for_inputs
(
args
,
files
)
if
xw
is
not
None
:
if
env_df
is
None
or
env_df
.
empty
:
pd
.
DataFrame
(
[
{
"Section"
:
"Environment"
,
"Key"
:
"vllm_env.txt"
,
"Value"
:
"NOT FOUND (or empty)"
,
}
]
).
to_excel
(
xw
,
sheet_name
=
env_sheet
,
index
=
False
)
else
:
env_df
.
to_excel
(
xw
,
sheet_name
=
env_sheet
,
index
=
False
)
used_sheets
.
add
(
env_sheet
)
with
open
(
"perf_comparison.html"
,
"w"
,
encoding
=
"utf-8"
)
as
main_fh
:
main_fh
.
write
(
'<meta charset="utf-8">
\n
'
)
for
gkey
in
group_keys
:
gkey_tuple
=
normalize_group_key
(
gkey
)
suffix
=
build_group_suffix
(
group_cols_canonical
,
gkey_tuple
)
sub_path
=
group_filename
(
gkey_tuple
)
group_header
=
(
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f
"
{
_html
.
escape
(
suffix
)
}
"
"</div>
\n
"
)
mn
=
metric_label
.
lower
().
strip
()
main_fh
.
write
(
group_header
)
if
"tok/s"
in
mn
:
tput_group_df
=
group_df
do_excel
=
xw
is
not
None
elif
"ttft"
in
mn
:
sheet
=
_group_to_sheet_base
(
group_cols_canonical
,
gkey_tuple
)
ttft_group_df
=
group_df
sheet_base
=
sheet
elif
mn
in
(
"p99"
,
"median"
)
or
"tpot"
in
mn
:
if
do_excel
:
tpot_group_df
=
group_df
dedup_i
=
1
while
sheet
in
used_sheets
:
dedup_i
+=
1
suffix
=
f
"_
{
dedup_i
}
"
# Ensure uniqueness even when sheet names are truncated.
base
=
str
(
sheet_base
)
keep
=
max
(
1
,
31
-
len
(
suffix
))
sheet
=
_sanitize_sheet_name
(
base
[:
keep
]
+
suffix
)
used_sheets
.
add
(
sheet
)
excel_blocks
:
list
[
tuple
[
str
,
pd
.
DataFrame
]]
=
[]
with
open
(
sub_path
,
"w"
,
encoding
=
"utf-8"
)
as
sub_fh
:
sub_fh
.
write
(
'<meta charset="utf-8">
\n
'
)
sub_fh
.
write
(
group_header
)
tput_group_df
=
None
ttft_group_df
=
None
tpot_group_df
=
None
conc_col
=
args
.
xaxis
for
metric_label
in
plan
.
data_cols
:
gb
=
metric_groupbys
[
metric_label
]
df_sorted
,
raw_data_cols
=
metric_cache
[
metric_label
]
try
:
group_df
=
gb
.
get_group
(
gkey
)
except
KeyError
:
missing
=
(
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f
"
{
_html
.
escape
(
metric_label
)
}
— missing for this group"
"</div>
\n
"
)
main_fh
.
write
(
missing
)
sub_fh
.
write
(
missing
)
continue
if
conc_col
not
in
group_df
.
columns
:
conc_col
=
_find_concurrency_col
(
group_df
)
mn
=
metric_label
.
lower
().
strip
()
if
"tok/s"
in
mn
:
tput_group_df
=
group_df
elif
"ttft"
in
mn
:
ttft_group_df
=
group_df
elif
mn
in
(
"p99"
,
"median"
)
or
"tpot"
in
mn
:
tpot_group_df
=
group_df
display_group
=
group_df
.
drop
(
columns
=
group_cols_canonical
,
errors
=
"ignore"
)
display_group
=
group_df
.
drop
(
html
=
render_metric_table_html
(
columns
=
group_cols_canonical
,
errors
=
"ignore"
display_group
,
metric_label
,
suffix
,
args
)
)
main_fh
.
write
(
html
)
sub_fh
.
write
(
html
)
maybe_write_plot
(
main_fh
,
sub_fh
,
group_df
=
group_df
,
raw_data_cols
=
raw_data_cols
,
metric_label
=
metric_label
,
y_axis_col
=
y_axis_col
,
args
=
args
,
)
html
=
render_metric_table_html
(
excel_blocks
.
append
(
display_group
,
metric_label
,
suffix
,
args
(
metric_label
,
group_df
.
reset_index
(
drop
=
True
))
)
if
csv_dir
:
fn
=
_safe_filename
(
f
"
{
sheet
}
__
{
metric_label
}
"
.
replace
(
" "
,
"_"
).
replace
(
"/"
,
"_"
)
)
group_df
.
to_csv
(
csv_dir
/
f
"
{
fn
}
.csv"
,
index
=
False
)
summary_html
=
build_valid_max_concurrency_summary_html
(
tput_group_df
=
tput_group_df
,
ttft_group_df
=
ttft_group_df
,
tpot_group_df
=
tpot_group_df
,
conc_col
=
conc_col
,
args
=
args
,
)
)
main_fh
.
write
(
html
)
if
summary_html
:
sub_fh
.
write
(
html
)
main_fh
.
write
(
summary_html
)
sub_fh
.
write
(
summary_html
)
maybe_write_plot
(
main_fh
,
summary_df
=
build_valid_max_concurrency_summary_df
(
sub_fh
,
tput_group_df
=
tput_group_df
,
group_df
=
group_df
,
ttft_group_df
=
ttft_group_df
,
raw_data_cols
=
raw_data_cols
,
tpot_group_df
=
tpot_group_df
,
metric_label
=
metric_label
,
conc_col
=
conc_col
,
y_axis_col
=
y_axis_col
,
args
=
args
,
args
=
args
,
)
)
if
summary_df
is
not
None
:
excel_blocks
.
append
(
(
"Valid Max Concurrency Summary"
,
summary_df
)
)
if
csv_dir
:
fn
=
_safe_filename
(
f
"
{
sheet
}
__Valid_Max_Concurrency_Summary"
)
summary_df
.
to_csv
(
csv_dir
/
f
"
{
fn
}
.csv"
,
index
=
False
)
summary_html
=
build_valid_max_concurrency_summary_html
(
if
do_excel
:
tput_group_df
=
tput_group_df
,
_write_tables_to_excel_sheet
(
xw
,
sheet
,
excel_blocks
)
ttft_group_df
=
ttft_group_df
,
tpot_group_df
=
tpot_group_df
,
if
disable_excel
:
conc_col
=
conc_col
,
print
(
"Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1)."
)
args
=
args
,
else
:
)
print
(
f
"Wrote Excel:
{
excel_path
}
"
)
if
summary_html
:
if
csv_dir
:
main_fh
.
write
(
summary_html
)
print
(
f
"Wrote CSVs under:
{
csv_dir
}
"
)
sub_fh
.
write
(
summary_html
)
def
main
():
def
main
():
...
...
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
100755 → 100644
View file @
3fb4b5fa
#!/bin/bash
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
# Benchmarking results will be available inside vllm/benchmarks/results/
...
@@ -9,14 +7,26 @@
...
@@ -9,14 +7,26 @@
set
-x
set
-x
set
-o
pipefail
set
-o
pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN
=
"
${
DRY_RUN
:-
0
}
"
MODEL_FILTER
=
"
${
MODEL_FILTER
:-}
"
DTYPE_FILTER
=
"
${
DTYPE_FILTER
:-}
"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY
=
"
${
ENABLE_ADAPTIVE_CONCURRENCY
:-
0
}
"
SLA_TTFT_MS
=
"
${
SLA_TTFT_MS
:-
3000
}
"
SLA_TPOT_MS
=
"
${
SLA_TPOT_MS
:-
100
}
"
ADAPTIVE_MAX_PROBES
=
"
${
ADAPTIVE_MAX_PROBES
:-
8
}
"
ADAPTIVE_MAX_CONCURRENCY
=
"
${
ADAPTIVE_MAX_CONCURRENCY
:-
1024
}
"
check_gpus
()
{
check_gpus
()
{
if
command
-v
nvidia-smi
;
then
if
command
-v
nvidia-smi
;
then
# check the number of GPUs and GPU type.
# check the number of GPUs and GPU type.
declare
-g
gpu_count
=
$(
nvidia-smi
--list-gpus
|
wc
-l
)
declare
-g
gpu_count
=
$(
nvidia-smi
--list-gpus
|
grep
-c
.
||
true
)
elif
command
-v
amd-smi
;
then
elif
command
-v
amd-smi
;
then
declare
-g
gpu_count
=
$(
amd-smi list |
grep
'GPU'
|
wc
-l
)
declare
-g
gpu_count
=
$(
amd-smi list |
grep
-c
'GPU'
|
|
true
)
elif
command
-v
hl-smi
;
then
elif
command
-v
hl-smi
;
then
declare
-g
gpu_count
=
$(
hl-smi
--list
|
grep
-i
"Module ID"
|
wc
-l
)
declare
-g
gpu_count
=
$(
hl-smi
--list
|
grep
-
c
i
"Module ID"
|
|
true
)
fi
fi
if
[[
$gpu_count
-gt
0
]]
;
then
if
[[
$gpu_count
-gt
0
]]
;
then
...
@@ -44,7 +54,7 @@ check_cpus() {
...
@@ -44,7 +54,7 @@ check_cpus() {
declare
-g
numa_count
=
$(
lscpu |
grep
"NUMA node(s):"
|
awk
'{print $3}'
)
declare
-g
numa_count
=
$(
lscpu |
grep
"NUMA node(s):"
|
awk
'{print $3}'
)
if
[[
$numa_count
-gt
0
]]
;
then
if
[[
$numa_count
-gt
0
]]
;
then
echo
"NUMA found."
echo
"NUMA found."
echo
$numa_count
echo
"
$numa_count
"
else
else
echo
"Need at least 1 NUMA to run benchmarking."
echo
"Need at least 1 NUMA to run benchmarking."
exit
1
exit
1
...
@@ -112,13 +122,12 @@ json2envs() {
...
@@ -112,13 +122,12 @@ json2envs() {
}
}
wait_for_server
()
{
wait_for_server
()
{
# wait for vllm server to start
# return 1 if vllm server crashes
local
timeout_val
=
"1200"
local
timeout_val
=
"1200"
timeout
"
$timeout_val
"
bash
-c
'
timeout
"
$timeout_val
"
bash
-c
'
until curl -
X POST
localhost:8000/v1/
completions
; do
until curl -
sf http://
localhost:8000/v1/
models >/dev/null
; do
sleep 1
sleep 1
done'
&&
return
0
||
return
1
done
'
}
}
kill_processes_launched_by_current_bash
()
{
kill_processes_launched_by_current_bash
()
{
...
@@ -181,6 +190,304 @@ upload_to_buildkite() {
...
@@ -181,6 +190,304 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND
artifact upload
"
$RESULTS_FOLDER
/*"
$BUILDKITE_AGENT_COMMAND
artifact upload
"
$RESULTS_FOLDER
/*"
}
}
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving
()
{
local
test_name
=
$1
local
qps
=
$2
local
max_concurrency
=
$3
echo
"
$RESULTS_FOLDER
/
${
test_name
}
_qps_
${
qps
}
_concurrency_
${
max_concurrency
}
.json"
}
# Print the first non-null latency value found in a benchmark result JSON.
# Arguments: $1 - "ttft" selects the TTFT key list; any other value selects
#                 the TPOT/ITL key list
#            $2 - path to the result JSON
# Outputs:   the first non-null candidate (p99 keys are listed before mean
#            keys), or nothing when the file is missing or no key matches
# Returns:   0 when the file does not exist; otherwise jq's exit status
extract_metric_ms() {
  local kind=$1 src=$2
  local filter

  if [[ ! -f "$src" ]]; then
    return 0
  fi

  if [[ "$kind" == "ttft" ]]; then
    filter='
      [
        .ttft_ms.p99?,
        .metrics.ttft_ms.p99?,
        .ttft.p99?,
        .metrics.ttft.p99?,
        .p99_ttft_ms?,
        .ttft_ms.mean?,
        .metrics.ttft_ms.mean?,
        .ttft.mean?,
        .metrics.ttft.mean?,
        .mean_ttft_ms?
      ] | map(select(. != null)) | .[0] // empty
    '
  else
    filter='
      [
        .tpot_ms.p99?,
        .metrics.tpot_ms.p99?,
        .tpot.p99?,
        .metrics.tpot.p99?,
        .p99_tpot_ms?,
        .itl_ms.p99?,
        .metrics.itl_ms.p99?,
        .inter_token_latency_ms.p99?,
        .tpot_ms.mean?,
        .metrics.tpot_ms.mean?,
        .tpot.mean?,
        .metrics.tpot.mean?,
        .itl_ms.mean?,
        .metrics.itl_ms.mean?,
        .mean_tpot_ms?,
        .mean_itl_ms?
      ] | map(select(. != null)) | .[0] // empty
    '
  fi

  jq -r "$filter" "$src"
}
# Check whether a benchmark result JSON satisfies both latency SLAs.
# Globals:   SLA_TTFT_MS, SLA_TPOT_MS (read)
# Arguments: $1 - path to the result JSON
# Returns:   0 - TTFT <= SLA_TTFT_MS and TPOT <= SLA_TPOT_MS
#            1 - at least one SLA is violated
#            2 - cannot evaluate: file missing, a metric missing, or the
#                extraction/comparison itself failed
evaluate_sla_from_json() {
  local json_file=$1
  local ttft
  local tpot
  local pass

  [[ -f "$json_file" ]] || return 2

  ttft=$(extract_metric_ms ttft "$json_file") || return 2
  tpot=$(extract_metric_ms tpot "$json_file") || return 2
  [[ -n "$ttft" && -n "$tpot" ]] || return 2

  # A jq failure (e.g. a non-JSON metric or SLA value) must be reported as
  # "unknown" (2); falling through would make it look like an SLA violation.
  pass=$(jq -n \
    --argjson ttft "$ttft" \
    --argjson tpot "$tpot" \
    --argjson sla_ttft "$SLA_TTFT_MS" \
    --argjson sla_tpot "$SLA_TPOT_MS" \
    '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)') || return 2

  [[ "$pass" == "true" ]]
}
# Write the adaptive-search summary for one (test, qps) pair as JSON.
# Globals:   SLA_TTFT_MS, SLA_TPOT_MS (read; passed to jq as JSON values)
# Arguments: $1 - output file
#            $2 - test name
#            $3 - qps (stored as a string)
#            $4 - static last pass    \
#            $5 - static first fail    | empty string is written as null,
#            $6 - final last pass      | anything else goes through tonumber
#            $7 - final first fail    /
# Returns:   jq's exit status
write_adaptive_summary_json() {
  local out_path=$1 name=$2 rate=$3
  local static_pass=${4:-} static_fail=${5:-}
  local final_pass=${6:-} final_fail=${7:-}
  local -a jq_args=(
    -n
    --arg test_name "$name"
    --arg qps "$rate"
    --argjson sla_ttft "$SLA_TTFT_MS"
    --argjson sla_tpot "$SLA_TPOT_MS"
    --arg static_last_pass "$static_pass"
    --arg static_first_fail "$static_fail"
    --arg final_last_pass "$final_pass"
    --arg final_first_fail "$final_fail"
  )

  jq "${jq_args[@]}" '
    def num_or_null: if . == "" then null else tonumber end;
    {
      test_name: $test_name,
      qps: $qps,
      sla_ttft_ms: $sla_ttft,
      sla_tpot_ms: $sla_tpot,
      static_last_pass: ($static_last_pass | num_or_null),
      static_first_fail: ($static_first_fail | num_or_null),
      final_last_pass: ($final_last_pass | num_or_null),
      final_first_fail: ($final_first_fail | num_or_null)
    }
  ' > "$out_path"
}
# Run (or reuse) one `vllm bench serve` probe at a given concurrency and
# report whether it meets the SLAs.
# Globals:   RESULTS_FOLDER, DRY_RUN, gpu_type, PROMPTS_PER_CONCURRENCY,
#            MIN_NUM_PROMPTS, MAX_NUM_PROMPTS (all read)
# Arguments: $1 - test name            $2 - qps
#            $3 - max concurrency      $4 - tensor parallel size
#            $5 - compilation config mode
#            $6 - optimization level
#            $7 - client args          $8 - extra remote client args
#            $9 - server command (recorded in the .commands file only)
# Outputs:   echoes the client command; writes
#            "$RESULTS_FOLDER/<name>.commands" after a new probe
# Returns:   the status of evaluate_sla_from_json on the probe's result JSON
run_single_serving_probe() {
  local test_name=$1
  local qps=$2
  local max_concurrency=$3
  local tp=$4
  local compilation_config_mode=$5
  local optimization_level=$6
  local client_args_effective=$7
  local client_remote_args=$8
  local server_command=$9
  local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
  local result_json
  # Kept local so the probe cannot clobber same-named variables of the caller.
  local num_prompts
  local num_prompts_arg=""
  local client_command
  local jq_output
  # Same fallbacks the serving-test loop applies, in case they are unset here.
  local min_prompts=${MIN_NUM_PROMPTS:-1}
  local max_prompts=${MAX_NUM_PROMPTS:-1000000}

  result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")

  # A result for this exact point already exists: reuse it instead of rerunning.
  if [[ -f "$result_json" ]]; then
    evaluate_sla_from_json "$result_json"
    return $?
  fi

  if [[ -n "${PROMPTS_PER_CONCURRENCY:-}" ]]; then
    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
    if (( num_prompts < min_prompts )); then
      num_prompts=$min_prompts
    fi
    if (( num_prompts > max_prompts )); then
      num_prompts=$max_prompts
    fi
    num_prompts_arg="--num-prompts $num_prompts"
  fi

  client_command="vllm bench serve \
    --save-result \
    --result-dir $RESULTS_FOLDER \
    --result-filename ${new_test_name}.json \
    --request-rate $qps \
    --max-concurrency $max_concurrency \
    $num_prompts_arg \
    --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
    $client_args_effective $client_remote_args "

  echo "Adaptive probe: $client_command"

  if [[ "${DRY_RUN:-0}" != "1" ]]; then
    bash -c "$client_command"
  fi

  jq_output=$(jq -n \
    --arg server "$server_command" \
    --arg client "$client_command" \
    --arg gpu "$gpu_type" \
    '{
      server_command: $server,
      client_command: $client,
      gpu_type: $gpu,
      adaptive_search: true
    }')
  echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

  evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results
()
{
local
test_name
=
$1
local
qps
=
$2
local
max_concurrency_list_raw
=
$3
local
tp
=
$4
local
compilation_config_mode
=
$5
local
optimization_level
=
$6
local
client_args_effective
=
$7
local
client_remote_args
=
$8
local
server_command
=
$9
local
sorted_points
local
point
local
rc
local
static_last_pass
=
""
local
static_first_fail
=
""
local
largest_static
=
""
local
step_hint
=
1
local
previous_point
=
""
local
low
local
high
local
mid
local
probes
=
0
local
summary_file
=
"
$RESULTS_FOLDER
/
${
test_name
}
_qps_
${
qps
}
_sla_summary.json"
[[
"
${
ENABLE_ADAPTIVE_CONCURRENCY
}
"
==
"1"
]]
||
return
0
[[
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
||
return
0
sorted_points
=
$(
for
point
in
$max_concurrency_list_raw
;
do
printf
'%s\n'
"
$point
"
;
done
|
tr
-d
"'"
|
awk
'/^[0-9]+$/'
|
sort
-n
|
uniq
)
[[
-n
"
$sorted_points
"
]]
||
return
0
while
read
-r
point
;
do
[[
-z
"
$point
"
]]
&&
continue
largest_static
=
"
$point
"
evaluate_sla_from_json
"
$(
result_json_path_for_serving
"
$test_name
"
"
$qps
"
"
$point
"
)
"
rc
=
$?
if
((
rc
==
0
))
;
then
static_last_pass
=
"
$point
"
elif
((
rc
==
1
))
;
then
if
[[
-n
"
$static_last_pass
"
]]
;
then
static_first_fail
=
"
$point
"
break
fi
fi
if
[[
-n
"
$previous_point
"
]]
;
then
step_hint
=
$((
point
-
previous_point
))
if
((
step_hint < 1
))
;
then
step_hint
=
1
;
fi
fi
previous_point
=
"
$point
"
done
<<<
"
$sorted_points
"
if
[[
-z
"
$static_last_pass
"
]]
;
then
write_adaptive_summary_json
"
$summary_file
"
"
$test_name
"
"
$qps
"
""
"
$static_first_fail
"
""
"
$static_first_fail
"
return
0
fi
if
[[
-n
"
$static_first_fail
"
]]
;
then
low
=
$static_last_pass
high
=
$static_first_fail
while
((
low + 1 < high
))
&&
((
probes < ADAPTIVE_MAX_PROBES
))
;
do
mid
=
$((
(
low
+
high
)
/
2
))
probes
=
$((
probes
+
1
))
run_single_serving_probe
\
"
$test_name
"
"
$qps
"
"
$mid
"
"
$tp
"
\
"
$compilation_config_mode
"
"
$optimization_level
"
\
"
$client_args_effective
"
"
$client_remote_args
"
"
$server_command
"
rc
=
$?
if
((
rc
==
0
))
;
then
low
=
$mid
elif
((
rc
==
1
))
;
then
high
=
$mid
else
break
fi
done
write_adaptive_summary_json
"
$summary_file
"
"
$test_name
"
"
$qps
"
"
$static_last_pass
"
"
$static_first_fail
"
"
$low
"
"
$high
"
return
0
fi
low
=
$largest_static
high
=
""
while
((
probes < ADAPTIVE_MAX_PROBES
))
;
do
point
=
$((
low
+
step_hint
))
if
((
point
>
ADAPTIVE_MAX_CONCURRENCY
))
;
then
point
=
$ADAPTIVE_MAX_CONCURRENCY
fi
((
point
>
low
))
||
break
probes
=
$((
probes
+
1
))
run_single_serving_probe
\
"
$test_name
"
"
$qps
"
"
$point
"
"
$tp
"
\
"
$compilation_config_mode
"
"
$optimization_level
"
\
"
$client_args_effective
"
"
$client_remote_args
"
"
$server_command
"
rc
=
$?
if
((
rc
==
0
))
;
then
low
=
$point
((
point
==
ADAPTIVE_MAX_CONCURRENCY
))
&&
break
step_hint
=
$((
step_hint
*
2
))
if
((
step_hint < 1
))
;
then
step_hint
=
1
;
fi
elif
((
rc
==
1
))
;
then
high
=
$point
break
else
break
fi
done
if
[[
-n
"
$high
"
]]
;
then
while
((
low + 1 < high
))
&&
((
probes < ADAPTIVE_MAX_PROBES
))
;
do
mid
=
$((
(
low
+
high
)
/
2
))
probes
=
$((
probes
+
1
))
run_single_serving_probe
\
"
$test_name
"
"
$qps
"
"
$mid
"
"
$tp
"
\
"
$compilation_config_mode
"
"
$optimization_level
"
\
"
$client_args_effective
"
"
$client_remote_args
"
"
$server_command
"
rc
=
$?
if
((
rc
==
0
))
;
then
low
=
$mid
elif
((
rc
==
1
))
;
then
high
=
$mid
else
break
fi
done
fi
write_adaptive_summary_json
"
$summary_file
"
"
$test_name
"
"
$qps
"
"
$static_last_pass
"
""
"
$low
"
"
$high
"
}
run_benchmark_tests
()
{
run_benchmark_tests
()
{
# run benchmark tests using `vllm bench <test_type>` command
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
# $1: test type (latency or throughput)
...
@@ -252,37 +559,16 @@ run_benchmark_tests() {
...
@@ -252,37 +559,16 @@ run_benchmark_tests() {
done
done
}
}
run_latency_tests
()
{
run_latency_tests
()
{
run_benchmark_tests
"latency"
"
$1
"
;
}
run_benchmark_tests
"latency"
"
$1
"
run_startup_tests
()
{
run_benchmark_tests
"startup"
"
$1
"
;
}
}
run_throughput_tests
()
{
run_benchmark_tests
"throughput"
"
$1
"
;
}
run_startup_tests
()
{
run_benchmark_tests
"startup"
"
$1
"
}
run_throughput_tests
()
{
run_benchmark_tests
"throughput"
"
$1
"
}
run_serving_tests
()
{
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local
serving_test_file
merge_serving_tests_stream
()
{
serving_test_file
=
$1
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
# Iterate over serving tests
local
serving_test_file
=
"
$1
"
jq
-c
'
# shellcheck disable=SC2016
local
merged
=
'
if type == "array" then
if type == "array" then
# Plain format: test cases array
# Plain format: test cases array
.[]
.[]
...
@@ -304,7 +590,50 @@ run_serving_tests() {
...
@@ -304,7 +590,50 @@ run_serving_tests() {
else
else
error("Unsupported serving test file format: must be array or object with .tests")
error("Unsupported serving test file format: must be array or object with .tests")
end
end
'
"
$serving_test_file
"
|
while
read
-r
params
;
do
'
jq
-c
"
$merged
"
"
$serving_test_file
"
|
\
if
[[
"
${
DRY_RUN
:-
0
}
"
==
"1"
&&
(
"
${
MODEL_FILTER
}${
DTYPE_FILTER
}
"
!=
""
)
]]
;
then
jq
-c
--arg
model
"
$MODEL_FILTER
"
--arg
dtype
"
$DTYPE_FILTER
"
'
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests
()
{
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local
serving_test_file
serving_test_file
=
$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if
[[
"
${
DRY_RUN
:-
0
}
"
==
"1"
&&
(
"
${
MODEL_FILTER
}${
DTYPE_FILTER
}
"
!=
""
)
]]
;
then
local
count
count
=
$(
merge_serving_tests_stream
"
$serving_test_file
"
|
wc
-l
|
tr
-d
' '
)
if
[[
"
$count
"
-eq
0
]]
;
then
echo
"No matching serving tests found in
$serving_test_file
for model='
$MODEL_FILTER
' dtype='
$DTYPE_FILTER
'."
>
&2
return
0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream
"
$serving_test_file
"
|
while
read
-r
params
;
do
# get the test name, and append the GPU type back to it.
# get the test name, and append the GPU type back to it.
test_name
=
$(
echo
"
$params
"
| jq
-r
'.test_name'
)
test_name
=
$(
echo
"
$params
"
| jq
-r
'.test_name'
)
if
[[
!
"
$test_name
"
=
~ ^serving_
]]
;
then
if
[[
!
"
$test_name
"
=
~ ^serving_
]]
;
then
...
@@ -323,10 +652,48 @@ run_serving_tests() {
...
@@ -323,10 +652,48 @@ run_serving_tests() {
server_envs
=
$(
echo
"
$params
"
| jq
-r
'.server_environment_variables'
)
server_envs
=
$(
echo
"
$params
"
| jq
-r
'.server_environment_variables'
)
client_params
=
$(
echo
"
$params
"
| jq
-r
'.client_parameters'
)
client_params
=
$(
echo
"
$params
"
| jq
-r
'.client_parameters'
)
server_args
=
$(
json2args
"
$server_params
"
)
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model
=
$(
echo
"
$server_params
"
| jq
-r
'.model // empty'
)
if
[[
-z
"
$server_model
"
||
"
$server_model
"
==
"null"
]]
;
then
echo
"Error: serving test '
$test_name
' is missing server_parameters.model"
>
&2
exit
1
fi
server_params_no_model
=
$(
echo
"
$server_params
"
| jq
-c
'del(.model)'
)
server_args
=
$(
json2args
"
$server_params_no_model
"
)
server_envs
=
$(
json2envs
"
$server_envs
"
)
server_envs
=
$(
json2envs
"
$server_envs
"
)
client_args
=
$(
json2args
"
$client_params
"
)
client_args
=
$(
json2args
"
$client_params
"
)
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY
=
"
${
PROMPTS_PER_CONCURRENCY
-
}
"
# no default on purpose
MIN_NUM_PROMPTS
=
"
${
MIN_NUM_PROMPTS
:-
1
}
"
MAX_NUM_PROMPTS
=
"
${
MAX_NUM_PROMPTS
:-
1000000
}
"
if
[[
-n
"
${
PROMPTS_PER_CONCURRENCY
}
"
]]
;
then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np
=
"
$(
printf
' %s '
"
$client_args
"
\
|
sed
-E
\
-e
's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g'
\
-e
's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)
"
# normalize whitespace
client_args_no_np
=
"
$(
echo
"
$client_args_no_np
"
|
tr
-s
' '
|
sed
-E
's/^ //; s/ $//'
)
"
client_args_no_np
=
"
$(
echo
"
$client_args_no_np
"
| xargs
)
"
client_args_effective
=
"
$client_args_no_np
"
else
client_args_effective
=
"
$client_args
"
fi
# qps_list
# qps_list
qps_list
=
$(
echo
"
$params
"
| jq
-r
'.qps_list'
)
qps_list
=
$(
echo
"
$params
"
| jq
-r
'.qps_list'
)
qps_list
=
$(
echo
"
$qps_list
"
| jq
-r
'.[] | @sh'
)
qps_list
=
$(
echo
"
$qps_list
"
| jq
-r
'.[] | @sh'
)
...
@@ -358,14 +725,13 @@ run_serving_tests() {
...
@@ -358,14 +725,13 @@ run_serving_tests() {
fi
fi
# check if server model and client model is aligned
# check if server model and client model is aligned
server_model
=
$(
echo
"
$server_params
"
| jq
-r
'.model'
)
client_model
=
$(
echo
"
$client_params
"
| jq
-r
'.model'
)
client_model
=
$(
echo
"
$client_params
"
| jq
-r
'.model'
)
if
[[
$server_model
!=
"
$client_model
"
]]
;
then
if
[[
$server_model
!=
"
$client_model
"
]]
;
then
echo
"Server model and client model must be the same. Skip testcase
$test_name
."
echo
"Server model and client model must be the same. Skip testcase
$test_name
."
continue
continue
fi
fi
server_command
=
"
$server_envs
vllm serve
\
server_command
=
"
$server_envs
vllm serve
$server_model
\
$server_args
"
$server_args
"
# run the server
# run the server
...
@@ -373,7 +739,7 @@ run_serving_tests() {
...
@@ -373,7 +739,7 @@ run_serving_tests() {
echo
"Server command:
$server_command
"
echo
"Server command:
$server_command
"
# support remote vllm server
# support remote vllm server
client_remote_args
=
""
client_remote_args
=
""
if
[[
-z
"
${
REMOTE_HOST
}
"
]]
;
then
if
[[
-z
"
${
REMOTE_HOST
}
"
&&
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
;
then
bash
-c
"
$server_command
"
&
bash
-c
"
$server_command
"
&
server_pid
=
$!
server_pid
=
$!
# wait until the server is alive
# wait until the server is alive
...
@@ -384,6 +750,9 @@ run_serving_tests() {
...
@@ -384,6 +750,9 @@ run_serving_tests() {
echo
""
echo
""
echo
"vLLM failed to start within the timeout period."
echo
"vLLM failed to start within the timeout period."
fi
fi
elif
[[
"
${
DRY_RUN
:-
0
}
"
==
"1"
]]
;
then
# dry-run: don't start server
echo
"Dry Run."
else
else
server_command
=
"Using Remote Server
$REMOTE_HOST
$REMOTE_PORT
"
server_command
=
"Using Remote Server
$REMOTE_HOST
$REMOTE_PORT
"
if
[[
${
REMOTE_PORT
}
]]
;
then
if
[[
${
REMOTE_PORT
}
]]
;
then
...
@@ -402,15 +771,21 @@ run_serving_tests() {
...
@@ -402,15 +771,21 @@ run_serving_tests() {
for
qps
in
$qps_list
;
do
for
qps
in
$qps_list
;
do
# remove the surrounding single quote from qps
# remove the surrounding single quote from qps
if
[[
"
$qps
"
==
*
"inf"
*
]]
;
then
if
[[
"
$qps
"
==
*
"inf"
*
]]
;
then
echo
"qps was
$qps
"
qps
=
"inf"
qps
=
"inf"
echo
"now qps is
$qps
"
fi
fi
# iterate over different max_concurrency
# iterate over different max_concurrency
for
max_concurrency
in
$max_concurrency_list
;
do
for
max_concurrency
in
$max_concurrency_list
;
do
new_test_name
=
$
test_name
"
_qps_
"
$qps
"
_concurrency_
"
$max_concurrency
new_test_name
=
"
${
test_name
}
_qps_
$
{
qps
}
_concurrency_
$
{
max_concurrency
}
"
echo
" new test name
$new_test_name
"
echo
" new test name
$new_test_name
"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg
=
""
if
[[
-n
"
${
PROMPTS_PER_CONCURRENCY
}
"
]]
;
then
num_prompts
=
$((
max_concurrency
*
PROMPTS_PER_CONCURRENCY
))
if
((
num_prompts < MIN_NUM_PROMPTS
))
;
then
num_prompts
=
$MIN_NUM_PROMPTS
;
fi
if
((
num_prompts
>
MAX_NUM_PROMPTS
))
;
then
num_prompts
=
$MAX_NUM_PROMPTS
;
fi
num_prompts_arg
=
"--num-prompts
$num_prompts
"
fi
# pass the tensor parallel size, the compilation mode, and the optimization
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
# level to the client so that they can be used on the benchmark dashboard
client_command
=
"vllm bench serve
\
client_command
=
"vllm bench serve
\
...
@@ -419,13 +794,16 @@ run_serving_tests() {
...
@@ -419,13 +794,16 @@ run_serving_tests() {
--result-filename
${
new_test_name
}
.json
\
--result-filename
${
new_test_name
}
.json
\
--request-rate
$qps
\
--request-rate
$qps
\
--max-concurrency
$max_concurrency
\
--max-concurrency
$max_concurrency
\
$num_prompts_arg
\
--metadata tensor_parallel_size=
$tp
compilation_config.mode=
$compilation_config_mode
optimization_level=
$optimization_level
\
--metadata tensor_parallel_size=
$tp
compilation_config.mode=
$compilation_config_mode
optimization_level=
$optimization_level
\
$client_args
$client_remote_args
"
$client_args
_effective
$client_remote_args
"
echo
"Running test case
$test_name
with qps
$qps
"
echo
"Running test case
$test_name
with qps
$qps
"
echo
"Client command:
$client_command
"
echo
"Client command:
$client_command
"
bash
-c
"
$client_command
"
if
[[
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
;
then
bash
-c
"
$client_command
"
fi
# record the benchmarking commands
# record the benchmarking commands
jq_output
=
$(
jq
-n
\
jq_output
=
$(
jq
-n
\
...
@@ -440,15 +818,23 @@ run_serving_tests() {
...
@@ -440,15 +818,23 @@ run_serving_tests() {
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
${
new_test_name
}
.commands"
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
${
new_test_name
}
.commands"
done
done
adaptive_refine_from_static_results
\
"
$test_name
"
"
$qps
"
"
$max_concurrency_list
"
"
$tp
"
\
"
$compilation_config_mode
"
"
$optimization_level
"
\
"
$client_args_effective
"
"
$client_remote_args
"
"
$server_command
"
done
done
# clean up
# clean up
kill
-9
$server_pid
if
[[
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
;
then
kill_gpu_processes
kill
-9
"
$server_pid
"
kill_gpu_processes
fi
done
done
}
}
main
()
{
main
()
{
local
ARCH
local
ARCH
ARCH
=
''
ARCH
=
''
if
[[
"
$ON_CPU
"
==
"1"
]]
;
then
if
[[
"
$ON_CPU
"
==
"1"
]]
;
then
...
@@ -458,7 +844,13 @@ main() {
...
@@ -458,7 +844,13 @@ main() {
check_gpus
check_gpus
ARCH
=
"
$arch_suffix
"
ARCH
=
"
$arch_suffix
"
fi
fi
check_hf_token
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if
[[
"
${
DRY_RUN
:-
0
}
"
!=
"1"
]]
;
then
check_hf_token
else
echo
"DRY_RUN=1 -> skip HF_TOKEN validation"
fi
# dependencies
# dependencies
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
...
@@ -479,11 +871,16 @@ main() {
...
@@ -479,11 +871,16 @@ main() {
# dump vllm info via vllm collect-env
# dump vllm info via vllm collect-env
env_output
=
$(
vllm collect-env
)
env_output
=
$(
vllm collect-env
)
echo
"
$env_output
"
>
"
$RESULTS_FOLDER
/vllm_env.txt"
echo
"
$env_output
"
>
"
$RESULTS_FOLDER
/vllm_env.txt"
# benchmarking
# benchmarking
run_serving_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
SERVING_JSON
:-
serving
-tests
$ARCH
.json
}
"
run_serving_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
SERVING_JSON
:-
serving
-tests
$ARCH
.json
}
"
||
exit
$?
if
[[
"
${
DRY_RUN
:-
0
}
"
==
"1"
]]
;
then
echo
"DRY_RUN=1 -> skip latency/startup/throughput suites"
exit
0
fi
run_latency_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
LATENCY_JSON
:-
latency
-tests
$ARCH
.json
}
"
run_latency_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
LATENCY_JSON
:-
latency
-tests
$ARCH
.json
}
"
run_startup_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
STARTUP_JSON
:-
startup
-tests
$ARCH
.json
}
"
run_startup_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
STARTUP_JSON
:-
startup
-tests
$ARCH
.json
}
"
run_throughput_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
THROUGHPUT_JSON
:-
throughput
-tests
$ARCH
.json
}
"
run_throughput_tests
$QUICK_BENCHMARK_ROOT
/tests/
"
${
THROUGHPUT_JSON
:-
throughput
-tests
$ARCH
.json
}
"
...
@@ -491,6 +888,7 @@ main() {
...
@@ -491,6 +888,7 @@ main() {
# postprocess benchmarking results
# postprocess benchmarking results
pip
install
tabulate pandas
pip
install
tabulate pandas
python3
$QUICK_BENCHMARK_ROOT
/scripts/convert-results-json-to-markdown.py
python3
$QUICK_BENCHMARK_ROOT
/scripts/convert-results-json-to-markdown.py
python3
$QUICK_BENCHMARK_ROOT
/scripts/compare-json-results.py
-f
$RESULTS_FOLDER
/benchmark_results.json
upload_to_buildkite
upload_to_buildkite
}
}
...
...
.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
View file @
3fb4b5fa
...
@@ -51,5 +51,56 @@
...
@@ -51,5 +51,56 @@
"max-model-len"
:
256
,
"max-model-len"
:
256
,
"async-scheduling"
:
""
"async-scheduling"
:
""
}
}
},
{
"test_name"
:
"latency_deepseek_r1"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"deepseek-ai/DeepSeek-R1"
,
"tensor_parallel_size"
:
8
,
"load_format"
:
"dummy"
,
"max-model-len"
:
2048
,
"dtype"
:
"bfloat16"
}
},
{
"test_name"
:
"latency_llama4_maverick_17b128e_instruct_fp8"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
,
"tensor_parallel_size"
:
8
,
"max-model-len"
:
512
,
"max-num-seqs"
:
128
,
"async-scheduling"
:
""
,
"gpu-memory-utilization"
:
0.95
,
"enable_expert_parallel"
:
""
}
},
{
"test_name"
:
"latency_qwen3_8b"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"tensor_parallel_size"
:
1
,
"max-model-len"
:
2048
,
"max-num-seqs"
:
128
,
"dtype"
:
"bfloat16"
,
"async-scheduling"
:
""
}
}
}
]
]
.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
0 → 100644
View file @
3fb4b5fa
{
"defaults"
:
{
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
},
"server_parameters"
:
{
"dtype"
:
"bfloat16"
,
"model"
:
"openai/whisper-large-v3-turbo"
},
"client_parameters"
:
{
"model"
:
"openai/whisper-large-v3-turbo"
,
"backend"
:
"openai-audio"
,
"endpoint"
:
"/v1/audio/transcriptions"
,
"dataset_name"
:
"hf"
,
"dataset_path"
:
"openslr/librispeech_asr"
,
"hf_subset"
:
"clean"
,
"hf_split"
:
"test"
,
"no_stream"
:
""
,
"no_oversample"
:
""
,
"num_prompts"
:
200
}
},
"tests"
:
[
{
"test_name"
:
"serving_whisper_large_v3_turbo_librispeech_clean_tp1"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{}
}
]
}
.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
0 → 100644
View file @
3fb4b5fa
{
"defaults"
:
{
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
32
,
64
,
128
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"dtype"
:
"bfloat16"
,
"model"
:
"jinaai/jina-embeddings-v3"
,
"trust_remote_code"
:
""
},
"client_parameters"
:
{
"model"
:
"jinaai/jina-embeddings-v3"
,
"backend"
:
"openai-embeddings"
,
"endpoint"
:
"/v1/embeddings"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
"tests"
:
[
{
"test_name"
:
"serving_jina_embed_v3_tp1_sharegpt"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{}
}
]
}
.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
0 → 100644
View file @
3fb4b5fa
{
"defaults"
:
{
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-3.1-8B-Instruct"
,
"tensor_parallel_size"
:
1
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"disable_log_stats"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-3.1-8B-Instruct"
,
"backend"
:
"vllm"
,
"ignore-eos"
:
""
,
"num_prompts"
:
200
}
},
"tests"
:
[
{
"test_name"
:
"serving_llama8B_tp1_sharegpt"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name"
:
"serving_llama8B_tp2_sharegpt"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_int4_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp2_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp4_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int8_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int8_tp2_random_128_128"
,
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int8_tp4_random_128_128"
,
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama3B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_granite2B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen1.7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen4B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen8B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_glm9B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_gemma7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
}
]
}
.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
View file @
3fb4b5fa
...
@@ -72,17 +72,6 @@
...
@@ -72,17 +72,6 @@
"random-output-len"
:
128
"random-output-len"
:
128
}
}
},
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
{
"test_name"
:
"serving_llama8B_tp1_random_128_2048"
,
"test_name"
:
"serving_llama8B_tp1_random_128_2048"
,
"server_parameters"
:
{
"server_parameters"
:
{
...
@@ -105,17 +94,6 @@
...
@@ -105,17 +94,6 @@
"random-output-len"
:
2048
"random-output-len"
:
2048
}
}
},
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
{
"test_name"
:
"serving_llama8B_tp1_random_2048_128"
,
"test_name"
:
"serving_llama8B_tp1_random_2048_128"
,
"server_parameters"
:
{
"server_parameters"
:
{
...
@@ -139,144 +117,25 @@
...
@@ -139,144 +117,25 @@
}
}
},
},
{
{
"test_name"
:
"serving_llama8B_tp4_random_2048_128"
,
"test_name"
:
"serving_llama8B_tp1_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp1_random_128_128"
,
"server_parameters"
:
{
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
1
"tensor_parallel_size"
:
1
},
},
"client_parameters"
:
{
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
1
28
,
"random-input-len"
:
2
04
8
,
"random-output-len"
:
1
28
"random-output-len"
:
2
04
8
}
}
},
},
{
{
"test_name"
:
"serving_llama8B_
int4_
tp2_random_
128_12
8"
,
"test_name"
:
"serving_llama8B_tp2_random_
2048_204
8"
,
"server_parameters"
:
{
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
2
"tensor_parallel_size"
:
2
},
},
"client_parameters"
:
{
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp4_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama3B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_granite2B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"dataset_name"
:
"random"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_qwen1.7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen4B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen8B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_glm9B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_gemma7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
}
}
}
]
]
...
...
Prev
1
2
3
4
5
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment