Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
1000
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1630 additions
and
374 deletions
+1630
-374
.buildkite/hardware_tests/amd.yaml
.buildkite/hardware_tests/amd.yaml
+3
-2
.buildkite/hardware_tests/cpu.yaml
.buildkite/hardware_tests/cpu.yaml
+14
-0
.buildkite/image_build/image_build.sh
.buildkite/image_build/image_build.sh
+11
-12
.buildkite/image_build/image_build.yaml
.buildkite/image_build/image_build.yaml
+1
-2
.buildkite/image_build/image_build_cpu.sh
.buildkite/image_build/image_build_cpu.sh
+6
-8
.buildkite/image_build/image_build_cpu_arm64.sh
.buildkite/image_build/image_build_cpu_arm64.sh
+5
-5
.buildkite/image_build/image_build_hpu.sh
.buildkite/image_build/image_build_hpu.sh
+5
-5
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
.../lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+2
-2
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+1
-1
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+1
-1
.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
...kite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+2
-5
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+8
-2
.buildkite/performance-benchmarks/README.md
.buildkite/performance-benchmarks/README.md
+0
-1
.buildkite/performance-benchmarks/scripts/compare-json-results.py
...te/performance-benchmarks/scripts/compare-json-results.py
+628
-126
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
...formance-benchmarks/scripts/run-performance-benchmarks.sh
+453
-55
.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
...dkite/performance-benchmarks/tests/latency-tests-hpu.json
+51
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
...e/performance-benchmarks/tests/serving-tests-cpu-asr.json
+37
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
...performance-benchmarks/tests/serving-tests-cpu-embed.json
+41
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
.../performance-benchmarks/tests/serving-tests-cpu-text.json
+355
-0
.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
...dkite/performance-benchmarks/tests/serving-tests-cpu.json
+6
-147
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
.buildkite/hardware_tests/amd.yaml
View file @
3fb4b5fa
group
:
Hardware
group
:
Hardware
- AMD Build
steps
:
steps
:
-
label
:
"
AMD:
:docker:
build
image"
-
label
:
"
AMD:
:docker:
build
image"
key
:
image-build-amd
depends_on
:
[]
depends_on
:
[]
device
:
amd_cpu
device
:
amd_cpu
no_plugin
:
true
no_plugin
:
true
...
@@ -9,7 +10,7 @@ steps:
...
@@ -9,7 +10,7 @@ steps:
docker build
docker build
--build-arg max_jobs=16
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942
;gfx950
'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
-f docker/Dockerfile.rocm
...
...
.buildkite/hardware_tests/cpu.yaml
View file @
3fb4b5fa
...
@@ -21,6 +21,20 @@ steps:
...
@@ -21,6 +21,20 @@ steps:
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
pytest -x -v -s tests/kernels/test_onednn.py"
-
label
:
CPU-Compatibility Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
-
cmake/cpu_extension.cmake
-
setup.py
-
vllm/platforms/cpu.py
commands
:
-
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
-
label
:
CPU-Language Generation and Pooling Model Tests
-
label
:
CPU-Language Generation and Pooling Model Tests
depends_on
:
[]
depends_on
:
[]
soft_fail
:
true
soft_fail
:
true
...
...
.buildkite/image_build/image_build.sh
View file @
3fb4b5fa
...
@@ -8,7 +8,7 @@ clean_docker_tag() {
...
@@ -8,7 +8,7 @@ clean_docker_tag() {
}
}
print_usage_and_exit
()
{
print_usage_and_exit
()
{
echo
"Usage:
$0
<registry> <repo> <commit> <branch> <
vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>
"
echo
"Usage:
$0
<registry> <repo> <commit> <branch> <
image_tag> [<image_tag_latest>]
"
exit
1
exit
1
}
}
...
@@ -142,11 +142,16 @@ resolve_parent_commit() {
...
@@ -142,11 +142,16 @@ resolve_parent_commit() {
print_bake_config
()
{
print_bake_config
()
{
echo
"--- :page_facing_up: Resolved bake configuration"
echo
"--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE
=
"bake-config-build-
${
BUILDKITE_BUILD_NUMBER
:-
local
}
.json"
# Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local
bake_tmp
bake_tmp
=
"
$(
mktemp
-d
)
"
BAKE_CONFIG_FILE
=
"
${
bake_tmp
}
/bake-config-build-
${
BUILDKITE_BUILD_NUMBER
:-
local
}
.json"
docker buildx bake
-f
"
${
VLLM_BAKE_FILE_PATH
}
"
-f
"
${
CI_HCL_PATH
}
"
--print
"
${
TARGET
}
"
|
tee
"
${
BAKE_CONFIG_FILE
}
"
||
true
docker buildx bake
-f
"
${
VLLM_BAKE_FILE_PATH
}
"
-f
"
${
CI_HCL_PATH
}
"
--print
"
${
TARGET
}
"
|
tee
"
${
BAKE_CONFIG_FILE
}
"
||
true
echo
"Saved bake config to
${
BAKE_CONFIG_FILE
}
"
echo
"Saved bake config to
${
BAKE_CONFIG_FILE
}
"
echo
"--- :arrow_down: Uploading bake config to Buildkite"
echo
"--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload
"
${
BAKE_CONFIG_FILE
}
"
(
cd
"
$(
dirname
"
${
BAKE_CONFIG_FILE
}
"
)
"
&&
buildkite-agent artifact upload
"
$(
basename
"
${
BAKE_CONFIG_FILE
}
"
)
"
)
}
}
#################################
#################################
...
@@ -154,7 +159,7 @@ print_bake_config() {
...
@@ -154,7 +159,7 @@ print_bake_config() {
#################################
#################################
print_instance_info
print_instance_info
if
[[
$#
-lt
7
]]
;
then
if
[[
$#
-lt
5
]]
;
then
print_usage_and_exit
print_usage_and_exit
fi
fi
...
@@ -163,10 +168,8 @@ REGISTRY=$1
...
@@ -163,10 +168,8 @@ REGISTRY=$1
REPO
=
$2
REPO
=
$2
BUILDKITE_COMMIT
=
$3
BUILDKITE_COMMIT
=
$3
BRANCH
=
$4
BRANCH
=
$4
VLLM_USE_PRECOMPILED
=
$5
IMAGE_TAG
=
$5
VLLM_MERGE_BASE_COMMIT
=
$6
IMAGE_TAG_LATEST
=
${
6
:-}
# only used for main branch, optional
IMAGE_TAG
=
$7
IMAGE_TAG_LATEST
=
${
8
:-}
# only used for main branch, optional
# build config
# build config
TARGET
=
"test-ci"
TARGET
=
"test-ci"
...
@@ -193,8 +196,6 @@ export CACHE_FROM
...
@@ -193,8 +196,6 @@ export CACHE_FROM
export
CACHE_FROM_BASE_BRANCH
export
CACHE_FROM_BASE_BRANCH
export
CACHE_FROM_MAIN
export
CACHE_FROM_MAIN
export
CACHE_TO
export
CACHE_TO
export
VLLM_USE_PRECOMPILED
export
VLLM_MERGE_BASE_COMMIT
# print args
# print args
echo
"--- :mag: Arguments"
echo
"--- :mag: Arguments"
...
@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
...
@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
echo
"REPO:
${
REPO
}
"
echo
"REPO:
${
REPO
}
"
echo
"BUILDKITE_COMMIT:
${
BUILDKITE_COMMIT
}
"
echo
"BUILDKITE_COMMIT:
${
BUILDKITE_COMMIT
}
"
echo
"BRANCH:
${
BRANCH
}
"
echo
"BRANCH:
${
BRANCH
}
"
echo
"VLLM_USE_PRECOMPILED:
${
VLLM_USE_PRECOMPILED
}
"
echo
"VLLM_MERGE_BASE_COMMIT:
${
VLLM_MERGE_BASE_COMMIT
}
"
echo
"IMAGE_TAG:
${
IMAGE_TAG
}
"
echo
"IMAGE_TAG:
${
IMAGE_TAG
}
"
echo
"IMAGE_TAG_LATEST:
${
IMAGE_TAG_LATEST
}
"
echo
"IMAGE_TAG_LATEST:
${
IMAGE_TAG_LATEST
}
"
...
...
.buildkite/image_build/image_build.yaml
View file @
3fb4b5fa
...
@@ -5,8 +5,7 @@ steps:
...
@@ -5,8 +5,7 @@ steps:
depends_on
:
[]
depends_on
:
[]
timeout_in_minutes
:
600
timeout_in_minutes
:
600
commands
:
commands
:
-
if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
-
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
-
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
retry
:
retry
:
automatic
:
automatic
:
-
exit_status
:
-1
# Agent was lost
-
exit_status
:
-1
# Agent was lost
...
...
.buildkite/image_build/image_build_cpu.sh
View file @
3fb4b5fa
...
@@ -11,10 +11,10 @@ REPO=$2
...
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT
=
$3
BUILDKITE_COMMIT
=
$3
# authenticate with AWS ECR
# authenticate with AWS ECR
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
# skip build if image already exists
# skip build if image already exists
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
)
]]
;
then
if
[[
-z
$(
docker manifest inspect
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-cpu
)
]]
;
then
echo
"Image not found, proceeding with build..."
echo
"Image not found, proceeding with build..."
else
else
echo
"Image found"
echo
"Image found"
...
@@ -24,13 +24,11 @@ fi
...
@@ -24,13 +24,11 @@ fi
# build
# build
docker build
--file
docker/Dockerfile.cpu
\
docker build
--file
docker/Dockerfile.cpu
\
--build-arg
max_jobs
=
16
\
--build-arg
max_jobs
=
16
\
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--build-arg
buildkite_commit
=
"
$BUILDKITE_COMMIT
"
\
--build-arg
VLLM_CPU_AVX512BF16
=
true
\
--build-arg
VLLM_CPU_X86
=
true
\
--build-arg
VLLM_CPU_AVX512VNNI
=
true
\
--tag
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-cpu
\
--build-arg
VLLM_CPU_AMXBF16
=
true
\
--tag
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
\
--target
vllm-test
\
--target
vllm-test
\
--progress
plain
.
--progress
plain
.
# push
# push
docker push
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
docker push
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-cpu
.buildkite/image_build/image_build_cpu_arm64.sh
View file @
3fb4b5fa
...
@@ -11,10 +11,10 @@ REPO=$2
...
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT
=
$3
BUILDKITE_COMMIT
=
$3
# authenticate with AWS ECR
# authenticate with AWS ECR
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
# skip build if image already exists
# skip build if image already exists
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
)
]]
;
then
if
[[
-z
$(
docker manifest inspect
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-arm64
-cpu
)
]]
;
then
echo
"Image not found, proceeding with build..."
echo
"Image not found, proceeding with build..."
else
else
echo
"Image found"
echo
"Image found"
...
@@ -24,10 +24,10 @@ fi
...
@@ -24,10 +24,10 @@ fi
# build
# build
docker build
--file
docker/Dockerfile.cpu
\
docker build
--file
docker/Dockerfile.cpu
\
--build-arg
max_jobs
=
16
\
--build-arg
max_jobs
=
16
\
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--build-arg
buildkite_commit
=
"
$BUILDKITE_COMMIT
"
\
--tag
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
\
--tag
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-arm64
-cpu
\
--target
vllm-test
\
--target
vllm-test
\
--progress
plain
.
--progress
plain
.
# push
# push
docker push
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-cpu
docker push
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-arm64
-cpu
.buildkite/image_build/image_build_hpu.sh
View file @
3fb4b5fa
...
@@ -11,10 +11,10 @@ REPO=$2
...
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT
=
$3
BUILDKITE_COMMIT
=
$3
# authenticate with AWS ECR
# authenticate with AWS ECR
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
$REGISTRY
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
"
$REGISTRY
"
# skip build if image already exists
# skip build if image already exists
if
[[
-z
$(
docker manifest inspect
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-hpu
)
]]
;
then
if
[[
-z
$(
docker manifest inspect
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-hpu
)
]]
;
then
echo
"Image not found, proceeding with build..."
echo
"Image not found, proceeding with build..."
else
else
echo
"Image found"
echo
"Image found"
...
@@ -25,10 +25,10 @@ fi
...
@@ -25,10 +25,10 @@ fi
docker build
\
docker build
\
--file
tests/pytorch_ci_hud_benchmark/Dockerfile.hpu
\
--file
tests/pytorch_ci_hud_benchmark/Dockerfile.hpu
\
--build-arg
max_jobs
=
16
\
--build-arg
max_jobs
=
16
\
--build-arg
buildkite_commit
=
$BUILDKITE_COMMIT
\
--build-arg
buildkite_commit
=
"
$BUILDKITE_COMMIT
"
\
--tag
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-hpu
\
--tag
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-hpu
\
--progress
plain
\
--progress
plain
\
https://github.com/vllm-project/vllm-gaudi.git
https://github.com/vllm-project/vllm-gaudi.git
# push
# push
docker push
$REGISTRY
/
$REPO
:
$BUILDKITE_COMMIT
-hpu
docker push
"
$REGISTRY
"
/
"
$REPO
"
:
"
$BUILDKITE_COMMIT
"
-hpu
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
View file @
3fb4b5fa
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
#
# Make sure you have lm-eval-harness installed:
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
usage
()
{
echo
``
echo
``
...
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
...
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks
chartqa
\
--tasks
chartqa
\
--batch_size
auto
\
--batch_size
auto
\
--apply_chat_template
\
--apply_chat_template
\
--limit
$LIMIT
--limit
"
$LIMIT
"
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
View file @
3fb4b5fa
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
# We can use this script to compute baseline accuracy on GSM for transformers.
#
#
# Make sure you have lm-eval-harness installed:
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
usage
()
{
echo
``
echo
``
...
...
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
View file @
3fb4b5fa
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
# We use this for fp8, which HF does not support.
#
#
# Make sure you have lm-eval-harness installed:
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
usage
()
{
echo
``
echo
``
...
...
.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
View file @
3fb4b5fa
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
# We use this for fp8, which HF does not support.
#
#
# Make sure you have lm-eval-harness installed:
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.
9.2
"
# pip install "lm-eval[api]>=0.4.
11
"
usage
()
{
usage
()
{
echo
``
echo
``
...
@@ -20,14 +20,11 @@ usage() {
...
@@ -20,14 +20,11 @@ usage() {
echo
echo
}
}
while
getopts
"m:
b:
l:f:t:"
OPT
;
do
while
getopts
"m:l:f:t:"
OPT
;
do
case
${
OPT
}
in
case
${
OPT
}
in
m
)
m
)
MODEL
=
"
$OPTARG
"
MODEL
=
"
$OPTARG
"
;;
;;
b
)
BATCH_SIZE
=
"
$OPTARG
"
;;
l
)
l
)
LIMIT
=
"
$OPTARG
"
LIMIT
=
"
$OPTARG
"
;;
;;
...
...
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
View file @
3fb4b5fa
...
@@ -13,9 +13,10 @@ import os
...
@@ -13,9 +13,10 @@ import os
from
contextlib
import
contextmanager
from
contextlib
import
contextmanager
import
lm_eval
import
lm_eval
import
numpy
as
np
import
yaml
import
yaml
from
vllm.platforms
import
current_platform
DEFAULT_RTOL
=
0.08
DEFAULT_RTOL
=
0.08
...
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
...
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True,"
"allow_deprecated_quantization=True,"
)
)
if
current_platform
.
is_rocm
()
and
"Nemotron-3"
in
eval_config
[
"model_name"
]:
model_args
+=
"attention_backend=TRITON_ATTN"
env_vars
=
eval_config
.
get
(
"env_vars"
,
None
)
env_vars
=
eval_config
.
get
(
"env_vars"
,
None
)
with
scoped_env_vars
(
env_vars
):
with
scoped_env_vars
(
env_vars
):
results
=
lm_eval
.
simple_evaluate
(
results
=
lm_eval
.
simple_evaluate
(
...
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
...
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f
"ground_truth=
{
ground_truth
:.
3
f
}
| "
f
"ground_truth=
{
ground_truth
:.
3
f
}
| "
f
"measured=
{
measured_value
:.
3
f
}
| rtol=
{
rtol
}
"
f
"measured=
{
measured_value
:.
3
f
}
| rtol=
{
rtol
}
"
)
)
success
=
success
and
np
.
isclose
(
ground_truth
,
measured_value
,
rtol
=
rtol
)
min_acceptable
=
ground_truth
*
(
1
-
rtol
)
success
=
success
and
measured_value
>=
min_acceptable
assert
success
assert
success
.buildkite/performance-benchmarks/README.md
View file @
3fb4b5fa
...
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
...
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters"
:
{
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"tensor_parallel_size"
:
1
,
"tensor_parallel_size"
:
1
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"disable_log_stats"
:
""
,
"load_format"
:
"dummy"
"load_format"
:
"dummy"
},
},
...
...
.buildkite/performance-benchmarks/scripts/compare-json-results.py
View file @
3fb4b5fa
This diff is collapsed.
Click to expand it.
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
100755 → 100644
View file @
3fb4b5fa
This diff is collapsed.
Click to expand it.
.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
View file @
3fb4b5fa
...
@@ -51,5 +51,56 @@
...
@@ -51,5 +51,56 @@
"max-model-len"
:
256
,
"max-model-len"
:
256
,
"async-scheduling"
:
""
"async-scheduling"
:
""
}
}
},
{
"test_name"
:
"latency_deepseek_r1"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"deepseek-ai/DeepSeek-R1"
,
"tensor_parallel_size"
:
8
,
"load_format"
:
"dummy"
,
"max-model-len"
:
2048
,
"dtype"
:
"bfloat16"
}
},
{
"test_name"
:
"latency_llama4_maverick_17b128e_instruct_fp8"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
,
"tensor_parallel_size"
:
8
,
"max-model-len"
:
512
,
"max-num-seqs"
:
128
,
"async-scheduling"
:
""
,
"gpu-memory-utilization"
:
0.95
,
"enable_expert_parallel"
:
""
}
},
{
"test_name"
:
"latency_qwen3_8b"
,
"environment_variables"
:
{
"PT_HPU_LAZY_MODE"
:
1
,
"PT_HPU_ENABLE_LAZY_COLLECTIVES"
:
1
,
"VLLM_CONTIGUOUS_PA"
:
1
,
"VLLM_DEFRAG"
:
1
},
"parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"tensor_parallel_size"
:
1
,
"max-model-len"
:
2048
,
"max-num-seqs"
:
128
,
"dtype"
:
"bfloat16"
,
"async-scheduling"
:
""
}
}
}
]
]
.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
0 → 100644
View file @
3fb4b5fa
{
"defaults"
:
{
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
},
"server_parameters"
:
{
"dtype"
:
"bfloat16"
,
"model"
:
"openai/whisper-large-v3-turbo"
},
"client_parameters"
:
{
"model"
:
"openai/whisper-large-v3-turbo"
,
"backend"
:
"openai-audio"
,
"endpoint"
:
"/v1/audio/transcriptions"
,
"dataset_name"
:
"hf"
,
"dataset_path"
:
"openslr/librispeech_asr"
,
"hf_subset"
:
"clean"
,
"hf_split"
:
"test"
,
"no_stream"
:
""
,
"no_oversample"
:
""
,
"num_prompts"
:
200
}
},
"tests"
:
[
{
"test_name"
:
"serving_whisper_large_v3_turbo_librispeech_clean_tp1"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{}
}
]
}
.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
0 → 100644
View file @
3fb4b5fa
{
"defaults"
:
{
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
32
,
64
,
128
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"dtype"
:
"bfloat16"
,
"model"
:
"jinaai/jina-embeddings-v3"
,
"trust_remote_code"
:
""
},
"client_parameters"
:
{
"model"
:
"jinaai/jina-embeddings-v3"
,
"backend"
:
"openai-embeddings"
,
"endpoint"
:
"/v1/embeddings"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
"tests"
:
[
{
"test_name"
:
"serving_jina_embed_v3_tp1_sharegpt"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{}
}
]
}
.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
0 → 100644
View file @
3fb4b5fa
{
"defaults"
:
{
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-3.1-8B-Instruct"
,
"tensor_parallel_size"
:
1
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"disable_log_stats"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-3.1-8B-Instruct"
,
"backend"
:
"vllm"
,
"ignore-eos"
:
""
,
"num_prompts"
:
200
}
},
"tests"
:
[
{
"test_name"
:
"serving_llama8B_tp1_sharegpt"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name"
:
"serving_llama8B_tp2_sharegpt"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_2048_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_tp1_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp2_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_tp4_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_llama8B_int4_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp2_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp4_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int8_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int8_tp2_random_128_128"
,
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
2
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int8_tp4_random_128_128"
,
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama3B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_granite2B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen1.7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen4B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen8B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_glm9B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_gemma7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
}
]
}
.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
View file @
3fb4b5fa
...
@@ -72,17 +72,6 @@
...
@@ -72,17 +72,6 @@
"random-output-len"
:
128
"random-output-len"
:
128
}
}
},
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_128"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
{
"test_name"
:
"serving_llama8B_tp1_random_128_2048"
,
"test_name"
:
"serving_llama8B_tp1_random_128_2048"
,
"server_parameters"
:
{
"server_parameters"
:
{
...
@@ -105,17 +94,6 @@
...
@@ -105,17 +94,6 @@
"random-output-len"
:
2048
"random-output-len"
:
2048
}
}
},
},
{
"test_name"
:
"serving_llama8B_tp4_random_128_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
2048
}
},
{
{
"test_name"
:
"serving_llama8B_tp1_random_2048_128"
,
"test_name"
:
"serving_llama8B_tp1_random_2048_128"
,
"server_parameters"
:
{
"server_parameters"
:
{
...
@@ -139,144 +117,25 @@
...
@@ -139,144 +117,25 @@
}
}
},
},
{
{
"test_name"
:
"serving_llama8B_tp4_random_2048_128"
,
"test_name"
:
"serving_llama8B_tp1_random_2048_2048"
,
"server_parameters"
:
{
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"dataset_name"
:
"random"
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp1_random_128_128"
,
"server_parameters"
:
{
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
1
"tensor_parallel_size"
:
1
},
},
"client_parameters"
:
{
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
1
28
,
"random-input-len"
:
2
04
8
,
"random-output-len"
:
1
28
"random-output-len"
:
2
04
8
}
}
},
},
{
{
"test_name"
:
"serving_llama8B_
int4_
tp2_random_
128_12
8"
,
"test_name"
:
"serving_llama8B_tp2_random_
2048_204
8"
,
"server_parameters"
:
{
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
2
"tensor_parallel_size"
:
2
},
},
"client_parameters"
:
{
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama8B_int4_tp4_random_128_128"
,
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"tensor_parallel_size"
:
4
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_llama3B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_granite2B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"ibm-granite/granite-3.2-2b-instruct"
,
"dataset_name"
:
"random"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-input-len"
:
2048
,
"random-output-len"
:
128
"random-output-len"
:
2048
}
},
{
"test_name"
:
"serving_qwen1.7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-1.7B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen4B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-4B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_qwen8B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"Qwen/Qwen3-8B"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_glm9B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"zai-org/glm-4-9b-hf"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
},
{
"test_name"
:
"serving_gemma7B_tp1_random_128_128"
,
"server_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"tensor_parallel_size"
:
1
},
"client_parameters"
:
{
"model"
:
"google/gemma-7b"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
}
}
}
}
]
]
...
...
Prev
1
2
3
4
5
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment