Commit 3fb4b5fa authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.0' into v0.18.0-ori

parents bcf25339 89138b21
group: Hardware group: Hardware - AMD Build
steps: steps:
- label: "AMD: :docker: build image" - label: "AMD: :docker: build image"
key: image-build-amd
depends_on: [] depends_on: []
device: amd_cpu device: amd_cpu
no_plugin: true no_plugin: true
...@@ -9,7 +10,7 @@ steps: ...@@ -9,7 +10,7 @@ steps:
docker build docker build
--build-arg max_jobs=16 --build-arg max_jobs=16
--build-arg REMOTE_VLLM=1 --build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}" --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm -f docker/Dockerfile.rocm
......
...@@ -21,6 +21,20 @@ steps: ...@@ -21,6 +21,20 @@ steps:
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py" pytest -x -v -s tests/kernels/test_onednn.py"
- label: CPU-Compatibility Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- cmake/cpu_extension.cmake
- setup.py
- vllm/platforms/cpu.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
- label: CPU-Language Generation and Pooling Model Tests - label: CPU-Language Generation and Pooling Model Tests
depends_on: [] depends_on: []
soft_fail: true soft_fail: true
......
...@@ -8,7 +8,7 @@ clean_docker_tag() { ...@@ -8,7 +8,7 @@ clean_docker_tag() {
} }
print_usage_and_exit() { print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>" echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
exit 1 exit 1
} }
...@@ -142,11 +142,16 @@ resolve_parent_commit() { ...@@ -142,11 +142,16 @@ resolve_parent_commit() {
print_bake_config() { print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration" echo "--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json" # Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local bake_tmp
bake_tmp="$(mktemp -d)"
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}" echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite" echo "--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}" (cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
} }
################################# #################################
...@@ -154,7 +159,7 @@ print_bake_config() { ...@@ -154,7 +159,7 @@ print_bake_config() {
################################# #################################
print_instance_info print_instance_info
if [[ $# -lt 7 ]]; then if [[ $# -lt 5 ]]; then
print_usage_and_exit print_usage_and_exit
fi fi
...@@ -163,10 +168,8 @@ REGISTRY=$1 ...@@ -163,10 +168,8 @@ REGISTRY=$1
REPO=$2 REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
BRANCH=$4 BRANCH=$4
VLLM_USE_PRECOMPILED=$5 IMAGE_TAG=$5
VLLM_MERGE_BASE_COMMIT=$6 IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
IMAGE_TAG=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
# build config # build config
TARGET="test-ci" TARGET="test-ci"
...@@ -193,8 +196,6 @@ export CACHE_FROM ...@@ -193,8 +196,6 @@ export CACHE_FROM
export CACHE_FROM_BASE_BRANCH export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN export CACHE_FROM_MAIN
export CACHE_TO export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT
# print args # print args
echo "--- :mag: Arguments" echo "--- :mag: Arguments"
...@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}" ...@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}" echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}" echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}" echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}" echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}" echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
......
...@@ -5,8 +5,7 @@ steps: ...@@ -5,8 +5,7 @@ steps:
depends_on: [] depends_on: []
timeout_in_minutes: 600 timeout_in_minutes: 600
commands: commands:
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
retry: retry:
automatic: automatic:
- exit_status: -1 # Agent was lost - exit_status: -1 # Agent was lost
......
...@@ -11,10 +11,10 @@ REPO=$2 ...@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
# authenticate with AWS ECR # authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
# skip build if image already exists # skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
echo "Image not found, proceeding with build..." echo "Image not found, proceeding with build..."
else else
echo "Image found" echo "Image found"
...@@ -24,13 +24,11 @@ fi ...@@ -24,13 +24,11 @@ fi
# build # build
docker build --file docker/Dockerfile.cpu \ docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \ --build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg VLLM_CPU_AVX512BF16=true \ --build-arg VLLM_CPU_X86=true \
--build-arg VLLM_CPU_AVX512VNNI=true \ --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--build-arg VLLM_CPU_AMXBF16=true \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \ --target vllm-test \
--progress plain . --progress plain .
# push # push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
...@@ -11,10 +11,10 @@ REPO=$2 ...@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
# authenticate with AWS ECR # authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
# skip build if image already exists # skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
echo "Image not found, proceeding with build..." echo "Image not found, proceeding with build..."
else else
echo "Image found" echo "Image found"
...@@ -24,10 +24,10 @@ fi ...@@ -24,10 +24,10 @@ fi
# build # build
docker build --file docker/Dockerfile.cpu \ docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \ --build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
--target vllm-test \ --target vllm-test \
--progress plain . --progress plain .
# push # push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
...@@ -11,10 +11,10 @@ REPO=$2 ...@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
# authenticate with AWS ECR # authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
# skip build if image already exists # skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
echo "Image not found, proceeding with build..." echo "Image not found, proceeding with build..."
else else
echo "Image found" echo "Image found"
...@@ -25,10 +25,10 @@ fi ...@@ -25,10 +25,10 @@ fi
docker build \ docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \ --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \ --build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \ --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
--progress plain \ --progress plain \
https://github.com/vllm-project/vllm-gaudi.git https://github.com/vllm-project/vllm-gaudi.git
# push # push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm. # We can use this script to compute baseline accuracy on chartqa for vllm.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2" # pip install "lm-eval[api]>=0.4.11"
usage() { usage() {
echo`` echo``
...@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \ ...@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks chartqa \ --tasks chartqa \
--batch_size auto \ --batch_size auto \
--apply_chat_template \ --apply_chat_template \
--limit $LIMIT --limit "$LIMIT"
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers. # We can use this script to compute baseline accuracy on GSM for transformers.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2" # pip install "lm-eval[api]>=0.4.11"
usage() { usage() {
echo`` echo``
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2" # pip install "lm-eval[api]>=0.4.11"
usage() { usage() {
echo`` echo``
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2" # pip install "lm-eval[api]>=0.4.11"
usage() { usage() {
echo`` echo``
...@@ -20,14 +20,11 @@ usage() { ...@@ -20,14 +20,11 @@ usage() {
echo echo
} }
while getopts "m:b:l:f:t:" OPT; do while getopts "m:l:f:t:" OPT; do
case ${OPT} in case ${OPT} in
m ) m )
MODEL="$OPTARG" MODEL="$OPTARG"
;; ;;
b )
BATCH_SIZE="$OPTARG"
;;
l ) l )
LIMIT="$OPTARG" LIMIT="$OPTARG"
;; ;;
......
...@@ -13,9 +13,10 @@ import os ...@@ -13,9 +13,10 @@ import os
from contextlib import contextmanager from contextlib import contextmanager
import lm_eval import lm_eval
import numpy as np
import yaml import yaml
from vllm.platforms import current_platform
DEFAULT_RTOL = 0.08 DEFAULT_RTOL = 0.08
...@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size): ...@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True," "allow_deprecated_quantization=True,"
) )
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
model_args += "attention_backend=TRITON_ATTN"
env_vars = eval_config.get("env_vars", None) env_vars = eval_config.get("env_vars", None)
with scoped_env_vars(env_vars): with scoped_env_vars(env_vars):
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
...@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size): ...@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f"ground_truth={ground_truth:.3f} | " f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}" f"measured={measured_value:.3f} | rtol={rtol}"
) )
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
min_acceptable = ground_truth * (1 - rtol)
success = success and measured_value >= min_acceptable
assert success assert success
...@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co ...@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
......
...@@ -51,5 +51,56 @@ ...@@ -51,5 +51,56 @@
"max-model-len": 256, "max-model-len": 256,
"async-scheduling": "" "async-scheduling": ""
} }
},
{
"test_name": "latency_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"max-model-len": 2048,
"dtype": "bfloat16"
}
},
{
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"max-model-len": 512,
"max-num-seqs": 128,
"async-scheduling": "",
"gpu-memory-utilization": 0.95,
"enable_expert_parallel": ""
}
},
{
"test_name": "latency_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1,
"max-model-len": 2048,
"max-num-seqs": 128,
"dtype": "bfloat16",
"async-scheduling": ""
}
} }
] ]
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}
...@@ -72,17 +72,6 @@ ...@@ -72,17 +72,6 @@
"random-output-len": 128 "random-output-len": 128
} }
}, },
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{ {
"test_name": "serving_llama8B_tp1_random_128_2048", "test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": { "server_parameters": {
...@@ -105,17 +94,6 @@ ...@@ -105,17 +94,6 @@
"random-output-len": 2048 "random-output-len": 2048
} }
}, },
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{ {
"test_name": "serving_llama8B_tp1_random_2048_128", "test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": { "server_parameters": {
...@@ -139,144 +117,25 @@ ...@@ -139,144 +117,25 @@
} }
}, },
{ {
"test_name": "serving_llama8B_tp4_random_2048_128", "test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": { "server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1 "tensor_parallel_size": 1
}, },
"client_parameters": { "client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random", "dataset_name": "random",
"random-input-len": 128, "random-input-len": 2048,
"random-output-len": 128 "random-output-len": 2048
} }
}, },
{ {
"test_name": "serving_llama8B_int4_tp2_random_128_128", "test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": { "server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2 "tensor_parallel_size": 2
}, },
"client_parameters": { "client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random", "dataset_name": "random",
"random-input-len": 128, "random-input-len": 2048,
"random-output-len": 128 "random-output-len": 2048
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
} }
} }
] ]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment