Commit 3fb4b5fa authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.0' into v0.18.0-ori

parents bcf25339 89138b21
group: Hardware
group: Hardware - AMD Build
steps:
- label: "AMD: :docker: build image"
key: image-build-amd
depends_on: []
device: amd_cpu
no_plugin: true
......@@ -9,7 +10,7 @@ steps:
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
......
......@@ -21,6 +21,20 @@ steps:
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
- label: CPU-Compatibility Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- cmake/cpu_extension.cmake
- setup.py
- vllm/platforms/cpu.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
- label: CPU-Language Generation and Pooling Model Tests
depends_on: []
soft_fail: true
......
......@@ -8,7 +8,7 @@ clean_docker_tag() {
}
print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
exit 1
}
......@@ -142,11 +142,16 @@ resolve_parent_commit() {
print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
# Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local bake_tmp
bake_tmp="$(mktemp -d)"
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
}
#################################
......@@ -154,7 +159,7 @@ print_bake_config() {
#################################
print_instance_info
if [[ $# -lt 7 ]]; then
if [[ $# -lt 5 ]]; then
print_usage_and_exit
fi
......@@ -163,10 +168,8 @@ REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
IMAGE_TAG=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
IMAGE_TAG=$5
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
# build config
TARGET="test-ci"
......@@ -193,8 +196,6 @@ export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT
# print args
echo "--- :mag: Arguments"
......@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
......
......@@ -5,8 +5,7 @@ steps:
depends_on: []
timeout_in_minutes: 600
commands:
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
retry:
automatic:
- exit_status: -1 # Agent was lost
......
......@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
......@@ -24,13 +24,11 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg VLLM_CPU_X86=true \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--target vllm-test \
--progress plain .
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
......@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
......@@ -24,10 +24,10 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
--target vllm-test \
--progress plain .
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
......@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
......@@ -25,10 +25,10 @@ fi
docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
--progress plain \
https://github.com/vllm-project/vllm-gaudi.git
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
......@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2"
# pip install "lm-eval[api]>=0.4.11"
usage() {
echo``
......@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT
--limit "$LIMIT"
......@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2"
# pip install "lm-eval[api]>=0.4.11"
usage() {
echo``
......
......@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2"
# pip install "lm-eval[api]>=0.4.11"
usage() {
echo``
......
......@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2"
# pip install "lm-eval[api]>=0.4.11"
usage() {
echo``
......@@ -20,14 +20,11 @@ usage() {
echo
}
while getopts "m:b:l:f:t:" OPT; do
while getopts "m:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
......
......@@ -13,9 +13,10 @@ import os
from contextlib import contextmanager
import lm_eval
import numpy as np
import yaml
from vllm.platforms import current_platform
DEFAULT_RTOL = 0.08
......@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True,"
)
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
model_args += "attention_backend=TRITON_ATTN"
env_vars = eval_config.get("env_vars", None)
with scoped_env_vars(env_vars):
results = lm_eval.simple_evaluate(
......@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}"
)
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
min_acceptable = ground_truth * (1 - rtol)
success = success and measured_value >= min_acceptable
assert success
......@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
......
......@@ -7,8 +7,10 @@ import argparse
import html as _html
import json
import os
from contextlib import nullcontext
from dataclasses import dataclass
from importlib import util
from pathlib import Path
import pandas as pd
......@@ -31,6 +33,45 @@ pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if "concurr" in str(c).lower():
s = df[c]
if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
return c
raise ValueError(
"Cannot infer concurrency column. "
"Please rename the column to one of the known names "
"or add an explicit override (e.g., --concurrency-col)."
)
def _normalize_concurrency_in_df(
df: pd.DataFrame, canonical: str = "# of max concurrency."
) -> pd.DataFrame:
if canonical in df.columns:
return df
detected = _find_concurrency_col(df)
if detected in df.columns and detected != canonical:
return df.rename(columns={detected: canonical})
df[canonical] = pd.NA
return df
# -----------------------------
# Core data compare
# -----------------------------
......@@ -50,19 +91,25 @@ def compare_data_columns(
- Concat along axis=1 (indexes align), then reset_index so callers can
group by columns.
- If --debug, add a <file_label>_name column per file.
Minimal fix to support different max_concurrency lists across files:
- normalize concurrency column naming to "# of max concurrency."
- align on UNION of keys (missing points become NaN)
- BUGFIX: don't drop throughput rows based on P99/Median presence
"""
print("\ncompare_data_column:", data_column)
frames = []
raw_data_cols: list[str] = []
compare_frames = []
# Determine key cols after normalizing concurrency
cols_per_file: list[set] = []
for f in files:
try:
df_tmp = pd.read_json(f, orient="records")
except Exception as err:
raise ValueError(f"Failed to read {f}") from err
df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
cols_per_file.append(set(df_tmp.columns))
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
......@@ -73,12 +120,25 @@ def compare_data_columns(
"No common key columns found from info_cols across the input files."
)
meta_added = False
union_index = None
metas: list[pd.DataFrame] = []
staged: list[tuple[str, pd.Series, pd.Series | None]] = []
for file in files:
df = pd.read_json(file, orient="records")
if drop_column in df.columns:
df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have
# NaN in P99/Median columns even if the column exists in the JSON.
metric_lc = str(data_column).lower()
is_latency_metric = (
"ttft" in metric_lc
or "tpot" in metric_lc
or "p99" in metric_lc
or "median" in metric_lc
or metric_lc.strip() in {"p99", "median"}
)
if is_latency_metric and drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True)
for c in (
......@@ -103,35 +163,61 @@ def compare_data_columns(
meta = meta.groupby(level=key_cols, dropna=False).first()
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
s.name = file_label
if not meta_added:
frames.append(meta)
meta_added = True
if data_column in df_idx.columns:
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
else:
# keep NA series to preserve meta keys for union_index
s = pd.Series(pd.NA, index=meta.index)
s.name = file_label
name_s = None
if debug and name_column in df_idx.columns:
name_s = df_idx[name_column]
if not name_s.index.is_unique:
name_s = name_s.groupby(level=key_cols, dropna=False).first()
name_s.name = f"{file_label}_name"
frames.append(name_s)
frames.append(s)
if union_index is None:
union_index = meta.index
else:
union_index = union_index.union(meta.index)
metas.append(meta)
staged.append((file_label, s, name_s))
if union_index is None:
raise ValueError("No data found after loading inputs.")
# meta first (union-aligned): build UNION meta across all files
if metas:
meta_union = pd.concat(metas, axis=0)
# Collapse duplicates on the MultiIndex; keep first non-null per column
meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
frames.append(meta_union.reindex(union_index))
# values + ratios (union-aligned)
metric_series_aligned: list[pd.Series] = []
for file_label, s, name_s in staged:
s_aligned = s.reindex(union_index)
frames.append(s_aligned)
raw_data_cols.append(file_label)
compare_frames.append(s)
metric_series_aligned.append(s_aligned)
if debug and name_s is not None:
frames.append(name_s.reindex(union_index))
if len(compare_frames) >= 2:
base = compare_frames[0]
current = compare_frames[-1]
if "P99" in data_column or "Median" in data_column:
if len(metric_series_aligned) >= 2:
base = metric_series_aligned[0]
current = metric_series_aligned[-1]
if "P99" in str(data_column) or "Median" in str(data_column):
ratio = base / current
else:
ratio = current / base
ratio = ratio.mask(base == 0)
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
frames.append(ratio)
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
......@@ -202,24 +288,10 @@ def split_json_by_tp_pp(
# -----------------------------
# Styling helpers
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
return "# of max concurrency."
def _highlight_threshold(
df: pd.DataFrame, threshold: float
df: pd.DataFrame,
threshold: float,
slack_pct: float = 0.0,
) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df)
key_cols = [
......@@ -232,12 +304,24 @@ def _highlight_threshold(
]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
return df.style.map(
lambda v: "background-color:#e6ffe6;font-weight:bold;"
if pd.notna(v) and v <= threshold
else "",
subset=conf_cols,
)
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
slack_limit = threshold * (1.0 + slack_pct / 100.0)
def _cell(v):
if pd.isna(v):
return ""
if v <= threshold:
# Strict SLA
return "background-color:#e6ffe6;font-weight:bold;"
if v <= slack_limit:
# Within slack range
return "background-color:#ffe5cc;font-weight:bold;"
return ""
return df.style.map(_cell, subset=conf_cols)
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
......@@ -275,6 +359,177 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
module's compile overhead/edge-cases on some systems.
"""
name = "sheet" if name is None else str(name)
# Replace illegal characters with underscore.
trans = str.maketrans(
{
":": "_",
"\\": "_",
"/": "_",
"?": "_",
"*": "_",
"[": "_",
"]": "_",
}
)
name = name.translate(trans)
# Strip quotes/spaces and collapse whitespace.
name = name.strip().strip("'")
name = " ".join(name.split())
if not name:
name = "sheet"
return name[:31]
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple))
# Always keep input/output lengths (these are important).
ilen = d.get("Input Len", "")
olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
# Shorten model name aggressively to make room for lens.
model = d.get("Model", "model")
leaf = str(model).split("/")[-1]
max_model_len = max(1, 31 - len(lens))
model_short = leaf[:max_model_len]
return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
"""Write all blocks to a sheet with a single to_excel() call.
Pandas+openpyxl can be extremely slow when called many times per sheet.
We flatten blocks into one table with a 'Section' column to keep structure
while making Excel generation fast and deterministic.
"""
if not blocks:
pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
return
combined_parts: list[pd.DataFrame] = []
for title, df in blocks:
df2 = df.copy()
# Put the section label as the first column for readability.
df2.insert(0, "Section", title)
combined_parts.append(df2)
combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
combined.to_excel(writer, sheet_name=sheet, index=False)
def _safe_filename(s: str) -> str:
# Fast path without the third-party `regex` module.
s = " ".join(str(s).strip().split())
allowed = []
for ch in s:
if ch.isalnum() or ch in "._-":
allowed.append(ch)
else:
allowed.append("_")
out = "".join(allowed)
return out[:180] if len(out) > 180 else out
# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
section = "General"
rows: list[dict] = []
def set_section(s: str):
nonlocal section
s = (s or "").strip()
if s:
section = s
for raw in lines:
stripped = raw.strip()
if not stripped:
continue
# divider lines like =====
if set(stripped) <= {"="}:
continue
# section header heuristic: short standalone line
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
if stripped.lower().startswith("collecting environment information"):
continue
set_section(stripped)
continue
# env var style: KEY=VALUE (and not a URL with :)
if "=" in stripped and ":" not in stripped:
k, v = stripped.split("=", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
# key: value
if ":" in stripped:
k, v = stripped.split(":", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir: Path | None = None
if getattr(args, "file", None):
base_dir = Path(args.file[0]).resolve().parent
elif files:
base_dir = Path(files[0]).resolve().parent
if base_dir is None:
return None
env_path = base_dir / "vllm_env.txt"
if not env_path.exists():
return None
df = _parse_vllm_env_txt(env_path)
return df
# -----------------------------
# Valid max concurrency summary helpers
# -----------------------------
......@@ -301,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def _max_concurrency_ok(
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
df: pd.DataFrame,
conc_col: str,
cfg_col: str,
threshold: float,
slack_pct: float = 0.0,
):
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
return pd.NA
......@@ -314,7 +573,14 @@ def _max_concurrency_ok(
if d.empty:
return pd.NA
ok = d[d[cfg_col] <= threshold]
# Accept values up to (1 + slack_pct%) above the SLA.
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
ok = d[d[cfg_col] <= effective_limit]
if ok.empty:
return pd.NA
......@@ -380,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None
else pd.NA
)
......@@ -417,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
......@@ -428,7 +704,6 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns:
if c == "Configuration":
continue
......@@ -436,12 +711,10 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {}
for c in summary_df.columns:
if c == "Configuration":
continue
# default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters)
......@@ -460,6 +733,104 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def build_valid_max_concurrency_summary_df(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> pd.DataFrame | None:
if ttft_group_df is None and tpot_group_df is None:
return None
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
df = pd.DataFrame(rows)
for c in df.columns:
if c != "Configuration":
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
# -----------------------------
# Plot helper
# -----------------------------
......@@ -537,6 +908,35 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
# ---- SLA tolerance (slack) options ----
parser.add_argument(
"--ttft-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TTFT SLA (default: 5).",
)
parser.add_argument(
"--tpot-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TPOT SLA (default: 5).",
)
# ---- export options ----
parser.add_argument(
"--excel-out",
type=str,
default="perf_comparison.xlsx",
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
)
parser.add_argument(
"--csv-out-dir",
type=str,
default="",
help="If set, write per-group per-metric CSVs into this directory.",
)
return parser
......@@ -615,9 +1015,13 @@ def render_metric_table_html(
metric_name = metric_label.lower()
if "ttft" in metric_name:
styler = _highlight_threshold(display_group, args.ttft_max_ms)
styler = _highlight_threshold(
display_group, args.ttft_max_ms, args.ttft_slack_pct
)
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
styler = _highlight_threshold(display_group, args.tpot_max_ms)
styler = _highlight_threshold(
display_group, args.tpot_max_ms, args.tpot_slack_pct
)
else:
styler = display_group.style
......@@ -657,7 +1061,6 @@ def maybe_write_plot(
markers=True,
)
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f")
......@@ -730,87 +1133,186 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items()
}
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
main_fh.write(group_header)
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
if csv_dir:
csv_dir.mkdir(parents=True, exist_ok=True)
main_fh.write(missing)
sub_fh.write(missing)
continue
excel_path = args.excel_out or "perf_comparison.xlsx"
disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
excel_engine = (
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
)
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
excel_engine = "openpyxl"
excel_engine_kwargs = {}
if excel_engine == "xlsxwriter":
# Reduce memory pressure & usually faster writes.
excel_engine_kwargs = {"options": {"constant_memory": True}}
xw_ctx = (
nullcontext(None)
if disable_excel
else pd.ExcelWriter(
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
)
)
with xw_ctx as xw:
used_sheets: set[str] = set()
# ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files)
if xw is not None:
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
used_sheets.add(env_sheet)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
main_fh.write(group_header)
do_excel = xw is not None
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
if do_excel:
dedup_i = 1
while sheet in used_sheets:
dedup_i += 1
suffix = f"_{dedup_i}"
# Ensure uniqueness even when sheet names are truncated.
base = str(sheet_base)
keep = max(1, 31 - len(suffix))
sheet = _sanitize_sheet_name(base[:keep] + suffix)
used_sheets.add(sheet)
excel_blocks: list[tuple[str, pd.DataFrame]] = []
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
excel_blocks.append(
(metric_label, group_df.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__{metric_label}".replace(" ", "_").replace(
"/", "_"
)
)
group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
summary_df = build_valid_max_concurrency_summary_df(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_df is not None:
excel_blocks.append(
("Valid Max Concurrency Summary", summary_df)
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__Valid_Max_Concurrency_Summary"
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
if do_excel:
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
if disable_excel:
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
else:
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")
def main():
......
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
......@@ -9,14 +7,26 @@
set -x
set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
fi
if [[ $gpu_count -gt 0 ]]; then
......@@ -44,7 +54,7 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo $numa_count
echo "$numa_count"
else
echo "Need at least 1 NUMA to run benchmarking."
exit 1
......@@ -112,13 +122,12 @@ json2envs() {
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200"
timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do
until curl -sf http://localhost:8000/v1/models >/dev/null; do
sleep 1
done' && return 0 || return 1
done
'
}
kill_processes_launched_by_current_bash() {
......@@ -181,6 +190,304 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving() {
local test_name=$1
local qps=$2
local max_concurrency=$3
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
}
extract_metric_ms() {
local metric_name=$1
local json_file=$2
[[ -f "$json_file" ]] || return 0
if [[ "$metric_name" == "ttft" ]]; then
jq -r '
[
.ttft_ms.p99?,
.metrics.ttft_ms.p99?,
.ttft.p99?,
.metrics.ttft.p99?,
.p99_ttft_ms?,
.ttft_ms.mean?,
.metrics.ttft_ms.mean?,
.ttft.mean?,
.metrics.ttft.mean?,
.mean_ttft_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
else
jq -r '
[
.tpot_ms.p99?,
.metrics.tpot_ms.p99?,
.tpot.p99?,
.metrics.tpot.p99?,
.p99_tpot_ms?,
.itl_ms.p99?,
.metrics.itl_ms.p99?,
.inter_token_latency_ms.p99?,
.tpot_ms.mean?,
.metrics.tpot_ms.mean?,
.tpot.mean?,
.metrics.tpot.mean?,
.itl_ms.mean?,
.metrics.itl_ms.mean?,
.mean_tpot_ms?,
.mean_itl_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
fi
}
evaluate_sla_from_json() {
local json_file=$1
local ttft
local tpot
local pass
[[ -f "$json_file" ]] || return 2
ttft=$(extract_metric_ms ttft "$json_file")
tpot=$(extract_metric_ms tpot "$json_file")
[[ -n "$ttft" && -n "$tpot" ]] || return 2
pass=$(jq -n \
--argjson ttft "$ttft" \
--argjson tpot "$tpot" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
[[ "$pass" == "true" ]]
}
write_adaptive_summary_json() {
local summary_file=$1
local test_name=$2
local qps=$3
local static_last_pass=$4
local static_first_fail=$5
local final_last_pass=$6
local final_first_fail=$7
jq -n \
--arg test_name "$test_name" \
--arg qps "$qps" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
--arg static_last_pass "${static_last_pass:-}" \
--arg static_first_fail "${static_first_fail:-}" \
--arg final_last_pass "${final_last_pass:-}" \
--arg final_first_fail "${final_first_fail:-}" \
'{
test_name: $test_name,
qps: $qps,
sla_ttft_ms: $sla_ttft,
sla_tpot_ms: $sla_tpot,
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
}' > "$summary_file"
}
run_single_serving_probe() {
local test_name=$1
local qps=$2
local max_concurrency=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
local result_json
local num_prompts_arg=""
local client_command
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
if [[ -f "$result_json" ]]; then
evaluate_sla_from_json "$result_json"
return $?
fi
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "
echo "Adaptive probe: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
adaptive_search: true
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
high=$point
break
else
break
fi
done
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
......@@ -252,37 +559,16 @@ run_benchmark_tests() {
done
}
run_latency_tests() {
run_benchmark_tests "latency" "$1"
}
run_startup_tests() {
run_benchmark_tests "startup" "$1"
}
run_throughput_tests() {
run_benchmark_tests "throughput" "$1"
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
run_latency_tests() { run_benchmark_tests "latency" "$1"; }
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '
merge_serving_tests_stream() {
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local serving_test_file="$1"
# shellcheck disable=SC2016
local merged='
if type == "array" then
# Plain format: test cases array
.[]
......@@ -304,7 +590,50 @@ run_serving_tests() {
else
error("Unsupported serving test file format: must be array or object with .tests")
end
' "$serving_test_file" | while read -r params; do
'
jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
......@@ -323,10 +652,48 @@ run_serving_tests() {
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
......@@ -358,14 +725,13 @@ run_serving_tests() {
fi
# check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi
server_command="$server_envs vllm serve \
server_command="$server_envs vllm serve $server_model \
$server_args"
# run the server
......@@ -373,7 +739,7 @@ run_serving_tests() {
echo "Server command: $server_command"
# support remote vllm server
client_remote_args=""
if [[ -z "${REMOTE_HOST}" ]]; then
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
......@@ -384,6 +750,9 @@ run_serving_tests() {
echo ""
echo "vLLM failed to start within the timeout period."
fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then
......@@ -402,15 +771,21 @@ run_serving_tests() {
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
# iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg=""
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \
......@@ -419,13 +794,16 @@ run_serving_tests() {
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args $client_remote_args "
$client_args_effective $client_remote_args "
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
bash -c "$client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
# record the benchmarking commands
jq_output=$(jq -n \
......@@ -440,15 +818,23 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done
# clean up
kill -9 $server_pid
kill_gpu_processes
if [[ "${DRY_RUN:-0}" != "1" ]]; then
kill -9 "$server_pid"
kill_gpu_processes
fi
done
}
main() {
local ARCH
ARCH=''
if [[ "$ON_CPU" == "1" ]]; then
......@@ -458,7 +844,13 @@ main() {
check_gpus
ARCH="$arch_suffix"
fi
check_hf_token
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
......@@ -479,11 +871,16 @@ main() {
# dump vllm info via vllm collect-env
env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
......@@ -491,6 +888,7 @@ main() {
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite
}
......
......@@ -51,5 +51,56 @@
"max-model-len": 256,
"async-scheduling": ""
}
},
{
"test_name": "latency_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"max-model-len": 2048,
"dtype": "bfloat16"
}
},
{
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"max-model-len": 512,
"max-num-seqs": 128,
"async-scheduling": "",
"gpu-memory-utilization": 0.95,
"enable_expert_parallel": ""
}
},
{
"test_name": "latency_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1,
"max-model-len": 2048,
"max-num-seqs": 128,
"dtype": "bfloat16",
"async-scheduling": ""
}
}
]
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}
......@@ -72,17 +72,6 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
......@@ -105,17 +94,6 @@
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
......@@ -139,144 +117,25 @@
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
"random-input-len": 2048,
"random-output-len": 2048
}
}
]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment