Unverified Commit 55a1a956 authored by Louie Tsai's avatar Louie Tsai Committed by GitHub
Browse files

Vllm CPU benchmark suite improvement (#34128)


Signed-off-by: default avatarlouie-tsai <louie.tsai@intel.com>
parent 386bfe5d
...@@ -9,8 +9,10 @@ import json ...@@ -9,8 +9,10 @@ import json
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from importlib import util from importlib import util
from pathlib import Path
import pandas as pd import pandas as pd
import regex as re
pd.options.display.float_format = "{:.2f}".format pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None plotly_found = util.find_spec("plotly.express") is not None
...@@ -275,6 +277,131 @@ def _apply_two_decimals( ...@@ -275,6 +277,131 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="") return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
    r"""Coerce *name* into a title that Excel accepts for a worksheet.

    Excel sheet constraints:
      - max 31 chars
      - cannot contain: : \ / ? * [ ]
      - cannot be empty
      - cannot begin or end with an apostrophe

    Args:
        name: Desired sheet title; ``None`` and non-string values are accepted
            and converted with ``str()``.

    Returns:
        A non-empty string of at most 31 characters with forbidden characters
        replaced by ``_``, whitespace runs collapsed to a single space, and no
        leading/trailing spaces or apostrophes. Falls back to ``"sheet"`` when
        nothing usable remains.
    """
    text = "sheet" if name is None else str(name)
    text = re.sub(r"[:\\/?*\[\]]", "_", text)
    # Collapse whitespace first so the edge trimming below only has to deal
    # with plain spaces.
    text = re.sub(r"\s+", " ", text)
    text = text.strip(" '")
    # Cutting at 31 chars can expose a new trailing space or apostrophe, so
    # the right edge is trimmed again after truncation.
    text = text[:31].rstrip(" '")
    return text or "sheet"
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
    """Build the base worksheet name for one report group.

    The name is the last path component of the group's ``Model`` value,
    followed by ``_<Input Len>x<Output Len>`` when both lengths are present
    in the group key. The result is passed through ``_sanitize_sheet_name``.

    Args:
        group_cols: Names of the grouping columns, in key order.
        gkey_tuple: The group key values, aligned with ``group_cols``.

    Returns:
        The sanitized sheet name (callers de-duplicate it if needed).
    """
    fields = dict(zip(group_cols, gkey_tuple))

    # "org/model-name" -> "model-name"; defaults to "model" if no Model column.
    short_model = str(fields.get("Model", "model")).split("/")[-1]

    input_len = fields.get("Input Len", "")
    output_len = fields.get("Output Len", "")
    if input_len != "" and output_len != "":
        length_suffix = f"_{input_len}x{output_len}"
    else:
        length_suffix = ""

    return _sanitize_sheet_name(f"{short_model}{length_suffix}")
def _write_tables_to_excel_sheet(
    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
    """Write several titled tables one below another on a single worksheet.

    For each ``(title, df)`` pair a one-cell title row is written, then the
    table (with its header row, without the index) directly beneath it. Two
    empty rows separate consecutive tables.

    Args:
        writer: Open Excel writer that receives the output.
        sheet: Name of the target worksheet.
        blocks: ``(title, DataFrame)`` pairs in the order they should appear.
    """
    next_row = 0
    for caption, table in blocks:
        caption_cell = pd.DataFrame([[caption]])
        caption_cell.to_excel(
            writer,
            sheet_name=sheet,
            index=False,
            header=False,
            startrow=next_row,
        )
        table.to_excel(
            writer, sheet_name=sheet, index=False, startrow=next_row + 1
        )
        # 1 title row + 1 header row + len(table) data rows + 2 spacer rows.
        next_row += len(table) + 4
def _safe_filename(s: str) -> str:
    """Turn *s* into a filename stem made of word characters, ``-`` and ``.``.

    The input is converted with ``str()`` and stripped; every run of other
    characters becomes a single ``_``; the result is capped at 180 characters.
    """
    trimmed = str(s).strip()
    cleaned = re.sub(r"[^\w\-.]+", "_", trimmed)
    return cleaned[:180]
# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
    """Parse vllm_env.txt into a flat table (Section, Key, Value).

    Supports:
      - section headers as standalone lines (no ':' or '=', at most 64 chars)
      - key-value lines like 'OS: Ubuntu ...'
      - env var lines like 'HF_HOME=/data/hf', including values that contain
        ':' such as 'LD_LIBRARY_PATH=/usr/lib:/opt/lib'

    Blank lines, divider lines made only of '=' and the
    'Collecting environment information' banner are skipped. Lines with
    neither separator that are longer than 64 characters are ignored.

    Args:
        env_path: Path of the text file to read (decoded as UTF-8, with
            undecodable bytes replaced).

    Returns:
        A DataFrame with columns ``Section``, ``Key`` and ``Value``; rows keep
        file order. Rows before the first header use section ``"General"``.
    """
    lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()

    section = "General"
    rows: list[dict] = []

    for raw in lines:
        stripped = raw.strip()
        if not stripped:
            continue

        # divider lines like =====
        if set(stripped) <= {"="}:
            continue

        eq_pos = stripped.find("=")
        colon_pos = stripped.find(":")

        if eq_pos < 0 and colon_pos < 0:
            # section header heuristic: short standalone line
            if len(stripped) <= 64 and not stripped.lower().startswith(
                "collecting environment information"
            ):
                section = stripped
            continue

        # A line is env-var style when it has '=' and either no ':' at all, or
        # the '=' comes first and is preceded by a plain identifier. The second
        # case keeps PATH-like values ("A=/x:/y") intact instead of splitting
        # them on the colon.
        env_style = eq_pos >= 0 and (
            colon_pos < 0
            or (
                eq_pos < colon_pos
                and stripped[:eq_pos].strip().isidentifier()
            )
        )

        separator = "=" if env_style else ":"
        key, _, value = stripped.partition(separator)
        key = key.strip()
        value = value.strip()
        if key:
            rows.append({"Section": section, "Key": key, "Value": value})

    return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir: Path | None = None
if getattr(args, "file", None):
base_dir = Path(args.file[0]).resolve().parent
elif files:
base_dir = Path(files[0]).resolve().parent
if base_dir is None:
return None
env_path = base_dir / "vllm_env.txt"
if not env_path.exists():
return None
df = _parse_vllm_env_txt(env_path)
return df
# ----------------------------- # -----------------------------
# Valid max concurrency summary helpers # Valid max concurrency summary helpers
# ----------------------------- # -----------------------------
...@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html( ...@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows) summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns: for c in summary_df.columns:
if c == "Configuration": if c == "Configuration":
continue continue
...@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html( ...@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)" both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {} formatters = {}
for c in summary_df.columns: for c in summary_df.columns:
if c == "Configuration": if c == "Configuration":
continue continue
# default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}" formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters) styler = summary_df.style.format(formatters)
...@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html( ...@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"') return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def build_valid_max_concurrency_summary_df(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> pd.DataFrame | None:
if ttft_group_df is None and tpot_group_df is None:
return None
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
df = pd.DataFrame(rows)
for c in df.columns:
if c != "Configuration":
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
# ----------------------------- # -----------------------------
# Plot helper # Plot helper
# ----------------------------- # -----------------------------
...@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser: ...@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0, default=100.0,
help="Reference limit for TPOT plots (ms)", help="Reference limit for TPOT plots (ms)",
) )
# ---- NEW: export options ----
parser.add_argument(
"--excel-out",
type=str,
default="perf_comparison.xlsx",
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
)
parser.add_argument(
"--csv-out-dir",
type=str,
default="",
help="If set, write per-group per-metric CSVs into this directory.",
)
return parser return parser
...@@ -657,7 +885,6 @@ def maybe_write_plot( ...@@ -657,7 +885,6 @@ def maybe_write_plot(
markers=True, markers=True,
) )
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>") fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f") fig.update_yaxes(tickformat=".2f")
...@@ -730,87 +957,151 @@ def write_report_group_first( ...@@ -730,87 +957,151 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items() for metric_label, (df, _) in metric_cache.items()
} }
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
main_fh.write('<meta charset="utf-8">\n') if csv_dir:
for gkey in group_keys: csv_dir.mkdir(parents=True, exist_ok=True)
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple) excel_path = args.excel_out or "perf_comparison.xlsx"
sub_path = group_filename(gkey_tuple) with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
group_header = ( # ---- Environment sheet (first) ----
'<div style="font-size: 1.4em; font-weight: 700; ' env_sheet = _sanitize_sheet_name("Environment")
'margin: 18px 0 10px 0;">' env_df = _load_env_df_for_inputs(args, files)
f"{_html.escape(suffix)}" if env_df is None or env_df.empty:
"</div>\n" pd.DataFrame(
) [
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
main_fh.write(group_header) main_fh.write(group_header)
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n') sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sub_fh.write(group_header) sheet_base = sheet
tput_group_df = None dedup_i = 1
ttft_group_df = None while sheet in xw.sheets:
tpot_group_df = None dedup_i += 1
conc_col = args.xaxis sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
for metric_label in plan.data_cols: excel_blocks: list[tuple[str, pd.DataFrame]] = []
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label] with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
try: sub_fh.write(group_header)
group_df = gb.get_group(gkey) tput_group_df = None
except KeyError: ttft_group_df = None
missing = ( tpot_group_df = None
'<div style="font-size: 1.1em; font-weight: 600; ' conc_col = args.xaxis
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group" for metric_label in plan.data_cols:
"</div>\n" gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
) )
main_fh.write(missing) html = render_metric_table_html(
sub_fh.write(missing) display_group, metric_label, suffix, args
continue )
main_fh.write(html)
if conc_col not in group_df.columns: sub_fh.write(html)
conc_col = _find_concurrency_col(group_df)
maybe_write_plot(
mn = metric_label.lower().strip() main_fh,
if "tok/s" in mn: sub_fh,
tput_group_df = group_df group_df=group_df,
elif "ttft" in mn: raw_data_cols=raw_data_cols,
ttft_group_df = group_df metric_label=metric_label,
elif mn in ("p99", "median") or "tpot" in mn: y_axis_col=y_axis_col,
tpot_group_df = group_df args=args,
)
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html( excel_blocks.append(
display_group, metric_label, suffix, args (metric_label, display_group.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__{metric_label}".replace(" ", "_").replace(
"/", "_"
)
)
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
) )
main_fh.write(html) if summary_html:
sub_fh.write(html) main_fh.write(summary_html)
sub_fh.write(summary_html)
maybe_write_plot(
main_fh, summary_df = build_valid_max_concurrency_summary_df(
sub_fh, tput_group_df=tput_group_df,
group_df=group_df, ttft_group_df=ttft_group_df,
raw_data_cols=raw_data_cols, tpot_group_df=tpot_group_df,
metric_label=metric_label, conc_col=conc_col,
y_axis_col=y_axis_col,
args=args, args=args,
) )
if summary_df is not None:
excel_blocks.append(
("Valid Max Concurrency Summary", summary_df)
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__Valid_Max_Concurrency_Summary"
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html( _write_tables_to_excel_sheet(xw, sheet, excel_blocks)
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df, print(f"Wrote Excel: {excel_path}")
tpot_group_df=tpot_group_df, if csv_dir:
conc_col=conc_col, print(f"Wrote CSVs under: {csv_dir}")
args=args,
)
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
def main(): def main():
......
#!/bin/bash #!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory # This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/ # Benchmarking results will be available inside vllm/benchmarks/results/
...@@ -9,6 +7,11 @@ ...@@ -9,6 +7,11 @@
set -x set -x
set -o pipefail set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
check_gpus() { check_gpus() {
if command -v nvidia-smi; then if command -v nvidia-smi; then
# check the number of GPUs and GPU type. # check the number of GPUs and GPU type.
...@@ -112,13 +115,12 @@ json2envs() { ...@@ -112,13 +115,12 @@ json2envs() {
} }
wait_for_server() { wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200" local timeout_val="1200"
timeout "$timeout_val" bash -c ' timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do until curl -sf http://localhost:8000/v1/models >/dev/null; do
sleep 1 sleep 1
done' && return 0 || return 1 done
'
} }
kill_processes_launched_by_current_bash() { kill_processes_launched_by_current_bash() {
...@@ -252,37 +254,16 @@ run_benchmark_tests() { ...@@ -252,37 +254,16 @@ run_benchmark_tests() {
done done
} }
run_latency_tests() { run_latency_tests() { run_benchmark_tests "latency" "$1"; }
run_benchmark_tests "latency" "$1" run_startup_tests() { run_benchmark_tests "startup" "$1"; }
} run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
run_startup_tests() {
run_benchmark_tests "startup" "$1"
}
run_throughput_tests() {
run_benchmark_tests "throughput" "$1"
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests merge_serving_tests_stream() {
jq -c ' # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local serving_test_file="$1"
# shellcheck disable=SC2016
local merged='
if type == "array" then if type == "array" then
# Plain format: test cases array # Plain format: test cases array
.[] .[]
...@@ -304,7 +285,50 @@ run_serving_tests() { ...@@ -304,7 +285,50 @@ run_serving_tests() {
else else
error("Unsupported serving test file format: must be array or object with .tests") error("Unsupported serving test file format: must be array or object with .tests")
end end
' "$serving_test_file" | while read -r params; do '
jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then if [[ ! "$test_name" =~ ^serving_ ]]; then
...@@ -373,7 +397,7 @@ run_serving_tests() { ...@@ -373,7 +397,7 @@ run_serving_tests() {
echo "Server command: $server_command" echo "Server command: $server_command"
# support remote vllm server # support remote vllm server
client_remote_args="" client_remote_args=""
if [[ -z "${REMOTE_HOST}" ]]; then if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$server_command" & bash -c "$server_command" &
server_pid=$! server_pid=$!
# wait until the server is alive # wait until the server is alive
...@@ -384,6 +408,9 @@ run_serving_tests() { ...@@ -384,6 +408,9 @@ run_serving_tests() {
echo "" echo ""
echo "vLLM failed to start within the timeout period." echo "vLLM failed to start within the timeout period."
fi fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then if [[ ${REMOTE_PORT} ]]; then
...@@ -402,9 +429,7 @@ run_serving_tests() { ...@@ -402,9 +429,7 @@ run_serving_tests() {
for qps in $qps_list; do for qps in $qps_list; do
# remove the surrounding single quote from qps # remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf" qps="inf"
echo "now qps is $qps"
fi fi
# iterate over different max_concurrency # iterate over different max_concurrency
...@@ -425,7 +450,9 @@ run_serving_tests() { ...@@ -425,7 +450,9 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
bash -c "$client_command" if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
# record the benchmarking commands # record the benchmarking commands
jq_output=$(jq -n \ jq_output=$(jq -n \
...@@ -443,12 +470,15 @@ run_serving_tests() { ...@@ -443,12 +470,15 @@ run_serving_tests() {
done done
# clean up # clean up
kill -9 $server_pid if [[ "${DRY_RUN:-0}" != "1" ]]; then
kill_gpu_processes kill -9 $server_pid
kill_gpu_processes
fi
done done
} }
main() { main() {
local ARCH local ARCH
ARCH='' ARCH=''
if [[ "$ON_CPU" == "1" ]]; then if [[ "$ON_CPU" == "1" ]]; then
...@@ -458,7 +488,13 @@ main() { ...@@ -458,7 +488,13 @@ main() {
check_gpus check_gpus
ARCH="$arch_suffix" ARCH="$arch_suffix"
fi fi
check_hf_token
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
# dependencies # dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
...@@ -479,11 +515,16 @@ main() { ...@@ -479,11 +515,16 @@ main() {
# dump vllm info via vllm collect-env # dump vllm info via vllm collect-env
env_output=$(vllm collect-env) env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking # benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}" run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
......
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}
...@@ -148,136 +148,6 @@ ...@@ -148,136 +148,6 @@
"random-input-len": 2048, "random-input-len": 2048,
"random-output-len": 128 "random-output-len": 128
} }
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
} }
] ]
} }
...@@ -176,7 +176,7 @@ For the full and up-to-date list of models validated on CPU platforms, please se ...@@ -176,7 +176,7 @@ For the full and up-to-date list of models validated on CPU platforms, please se
### How to find benchmark configuration examples for supported CPU models? ### How to find benchmark configuration examples for supported CPU models?
For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](../../../.buildkite/performance-benchmarks/tests/serving-tests-cpu.json) For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in cpu test cases as serving-tests-cpu.json. Full test cases for Text-only models, Multi-Modal models and Embedded models are in cpu Text-Only test cases as serving-tests-cpu-text.json, cpu Multi-Modal test cases as serving-tests-cpu-multimodal.json and cpu Embedded test cases as serving-tests-cpu-embed.json.
For details on how these optimized configurations are determined, see: [performance-benchmark-details](../../../.buildkite/performance-benchmarks/README.md#performance-benchmark-details). For details on how these optimized configurations are determined, see: [performance-benchmark-details](../../../.buildkite/performance-benchmarks/README.md#performance-benchmark-details).
To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](../../benchmarking/dashboard.md#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment. To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](../../benchmarking/dashboard.md#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.
...@@ -199,6 +199,28 @@ lscpu | grep "NUMA node(s):" | awk '{print $3}' ...@@ -199,6 +199,28 @@ lscpu | grep "NUMA node(s):" | awk '{print $3}'
For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu) For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu)
, which publishes default-model CPU results produced using the same Benchmark Suite. , which publishes default-model CPU results produced using the same Benchmark Suite.
#### Dry-Run
For users who only need the optimized runtime configurations without running the benchmark, a Dry-Run mode is provided.
By setting the environment variable `DRY_RUN=1` when invoking `run-performance-benchmarks.sh`,
all commands will be generated under `./benchmark/results/`.
```bash
ON_CPU=1 DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
By providing a different JSON file via `SERVING_JSON`, users can get runtime configurations for different models, such as Embedded Models.
```bash
ON_CPU=1 SERVING_JSON=serving-tests-cpu-embed.json DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
By providing `MODEL_FILTER` and `DTYPE_FILTER`, only commands for the matching model ID and data type will be generated.
```bash
ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-llama/Llama-3.1-8B-Instruct DTYPE_FILTER=bfloat16 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
### How to decide `VLLM_CPU_OMP_THREADS_BIND`? ### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following. - Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment