Unverified Commit 17852aa5 authored by Louie Tsai's avatar Louie Tsai Committed by GitHub
Browse files

more models for vLLM Benchmark Suite (#35086)


Signed-off-by: default avatarlouie-tsai <louie.tsai@intel.com>
parent 8647c6cf
...@@ -7,12 +7,12 @@ import argparse ...@@ -7,12 +7,12 @@ import argparse
import html as _html import html as _html
import json import json
import os import os
from contextlib import nullcontext
from dataclasses import dataclass from dataclasses import dataclass
from importlib import util from importlib import util
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
import regex as re
pd.options.display.float_format = "{:.2f}".format pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None plotly_found = util.find_spec("plotly.express") is not None
...@@ -33,6 +33,45 @@ pd.set_option("display.precision", 2) ...@@ -33,6 +33,45 @@ pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}") pd.set_option("display.float_format", lambda x: f"{x:.2f}")
# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if "concurr" in str(c).lower():
s = df[c]
if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
return c
raise ValueError(
"Cannot infer concurrency column. "
"Please rename the column to one of the known names "
"or add an explicit override (e.g., --concurrency-col)."
)
def _normalize_concurrency_in_df(
df: pd.DataFrame, canonical: str = "# of max concurrency."
) -> pd.DataFrame:
if canonical in df.columns:
return df
detected = _find_concurrency_col(df)
if detected in df.columns and detected != canonical:
return df.rename(columns={detected: canonical})
df[canonical] = pd.NA
return df
# ----------------------------- # -----------------------------
# Core data compare # Core data compare
# ----------------------------- # -----------------------------
...@@ -52,19 +91,25 @@ def compare_data_columns( ...@@ -52,19 +91,25 @@ def compare_data_columns(
- Concat along axis=1 (indexes align), then reset_index so callers can - Concat along axis=1 (indexes align), then reset_index so callers can
group by columns. group by columns.
- If --debug, add a <file_label>_name column per file. - If --debug, add a <file_label>_name column per file.
Minimal fix to support different max_concurrency lists across files:
- normalize concurrency column naming to "# of max concurrency."
- align on UNION of keys (missing points become NaN)
- BUGFIX: don't drop throughput rows based on P99/Median presence
""" """
print("\ncompare_data_column:", data_column) print("\ncompare_data_column:", data_column)
frames = [] frames = []
raw_data_cols: list[str] = [] raw_data_cols: list[str] = []
compare_frames = []
# Determine key cols after normalizing concurrency
cols_per_file: list[set] = [] cols_per_file: list[set] = []
for f in files: for f in files:
try: try:
df_tmp = pd.read_json(f, orient="records") df_tmp = pd.read_json(f, orient="records")
except Exception as err: except Exception as err:
raise ValueError(f"Failed to read {f}") from err raise ValueError(f"Failed to read {f}") from err
df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
cols_per_file.append(set(df_tmp.columns)) cols_per_file.append(set(df_tmp.columns))
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
...@@ -75,12 +120,25 @@ def compare_data_columns( ...@@ -75,12 +120,25 @@ def compare_data_columns(
"No common key columns found from info_cols across the input files." "No common key columns found from info_cols across the input files."
) )
meta_added = False union_index = None
metas: list[pd.DataFrame] = []
staged: list[tuple[str, pd.Series, pd.Series | None]] = []
for file in files: for file in files:
df = pd.read_json(file, orient="records") df = pd.read_json(file, orient="records")
df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
if drop_column in df.columns:
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have
# NaN in P99/Median columns even if the column exists in the JSON.
metric_lc = str(data_column).lower()
is_latency_metric = (
"ttft" in metric_lc
or "tpot" in metric_lc
or "p99" in metric_lc
or "median" in metric_lc
or metric_lc.strip() in {"p99", "median"}
)
if is_latency_metric and drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True) df = df.dropna(subset=[drop_column], ignore_index=True)
for c in ( for c in (
...@@ -105,35 +163,61 @@ def compare_data_columns( ...@@ -105,35 +163,61 @@ def compare_data_columns(
meta = meta.groupby(level=key_cols, dropna=False).first() meta = meta.groupby(level=key_cols, dropna=False).first()
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
s.name = file_label
if not meta_added: if data_column in df_idx.columns:
frames.append(meta) s = df_idx[data_column]
meta_added = True if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
else:
# keep NA series to preserve meta keys for union_index
s = pd.Series(pd.NA, index=meta.index)
s.name = file_label
name_s = None
if debug and name_column in df_idx.columns: if debug and name_column in df_idx.columns:
name_s = df_idx[name_column] name_s = df_idx[name_column]
if not name_s.index.is_unique: if not name_s.index.is_unique:
name_s = name_s.groupby(level=key_cols, dropna=False).first() name_s = name_s.groupby(level=key_cols, dropna=False).first()
name_s.name = f"{file_label}_name" name_s.name = f"{file_label}_name"
frames.append(name_s)
frames.append(s) if union_index is None:
union_index = meta.index
else:
union_index = union_index.union(meta.index)
metas.append(meta)
staged.append((file_label, s, name_s))
if union_index is None:
raise ValueError("No data found after loading inputs.")
# meta first (union-aligned): build UNION meta across all files
if metas:
meta_union = pd.concat(metas, axis=0)
# Collapse duplicates on the MultiIndex; keep first non-null per column
meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
frames.append(meta_union.reindex(union_index))
# values + ratios (union-aligned)
metric_series_aligned: list[pd.Series] = []
for file_label, s, name_s in staged:
s_aligned = s.reindex(union_index)
frames.append(s_aligned)
raw_data_cols.append(file_label) raw_data_cols.append(file_label)
compare_frames.append(s) metric_series_aligned.append(s_aligned)
if debug and name_s is not None:
frames.append(name_s.reindex(union_index))
if len(compare_frames) >= 2: if len(metric_series_aligned) >= 2:
base = compare_frames[0] base = metric_series_aligned[0]
current = compare_frames[-1] current = metric_series_aligned[-1]
if "P99" in data_column or "Median" in data_column: if "P99" in str(data_column) or "Median" in str(data_column):
ratio = base / current ratio = base / current
else: else:
ratio = current / base ratio = current / base
ratio = ratio.mask(base == 0) ratio = ratio.mask(base == 0)
ratio.name = f"Ratio 1 vs {len(compare_frames)}" ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
frames.append(ratio) frames.append(ratio)
concat_df = pd.concat(frames, axis=1).reset_index(drop=True) concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
...@@ -204,24 +288,10 @@ def split_json_by_tp_pp( ...@@ -204,24 +288,10 @@ def split_json_by_tp_pp(
# ----------------------------- # -----------------------------
# Styling helpers # Styling helpers
# ----------------------------- # -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
return "# of max concurrency."
def _highlight_threshold( def _highlight_threshold(
df: pd.DataFrame, threshold: float df: pd.DataFrame,
threshold: float,
slack_pct: float = 0.0,
) -> pd.io.formats.style.Styler: ) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df) conc_col = _find_concurrency_col(df)
key_cols = [ key_cols = [
...@@ -234,12 +304,24 @@ def _highlight_threshold( ...@@ -234,12 +304,24 @@ def _highlight_threshold(
] ]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
return df.style.map( try:
lambda v: "background-color:#e6ffe6;font-weight:bold;" slack_pct = float(slack_pct or 0.0)
if pd.notna(v) and v <= threshold except Exception:
else "", slack_pct = 0.0
subset=conf_cols, slack_limit = threshold * (1.0 + slack_pct / 100.0)
)
def _cell(v):
if pd.isna(v):
return ""
if v <= threshold:
# Strict SLA
return "background-color:#e6ffe6;font-weight:bold;"
if v <= slack_limit:
# Within slack range
return "background-color:#ffe5cc;font-weight:bold;"
return ""
return df.style.map(_cell, subset=conf_cols)
def highlight_ratio_columns(styler: pd.io.formats.style.Styler): def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
...@@ -286,11 +368,30 @@ def _sanitize_sheet_name(name: str) -> str: ...@@ -286,11 +368,30 @@ def _sanitize_sheet_name(name: str) -> str:
- max 31 chars - max 31 chars
- cannot contain: : \ / ? * [ ] - cannot contain: : \ / ? * [ ]
- cannot be empty - cannot be empty
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
module's compile overhead/edge-cases on some systems.
""" """
name = "sheet" if name is None else str(name) name = "sheet" if name is None else str(name)
name = re.sub(r"[:\\/?*\[\]]", "_", name)
# Replace illegal characters with underscore.
trans = str.maketrans(
{
":": "_",
"\\": "_",
"/": "_",
"?": "_",
"*": "_",
"[": "_",
"]": "_",
}
)
name = name.translate(trans)
# Strip quotes/spaces and collapse whitespace.
name = name.strip().strip("'") name = name.strip().strip("'")
name = re.sub(r"\s+", " ", name) name = " ".join(name.split())
if not name: if not name:
name = "sheet" name = "sheet"
return name[:31] return name[:31]
...@@ -298,30 +399,57 @@ def _sanitize_sheet_name(name: str) -> str: ...@@ -298,30 +399,57 @@ def _sanitize_sheet_name(name: str) -> str:
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str: def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple)) d = dict(zip(group_cols, gkey_tuple))
model = d.get("Model", "model")
model_short = str(model).split("/")[-1] # Always keep input/output lengths (these are important).
ilen = d.get("Input Len", "") ilen = d.get("Input Len", "")
olen = d.get("Output Len", "") olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else "" lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
# Shorten model name aggressively to make room for lens.
model = d.get("Model", "model")
leaf = str(model).split("/")[-1]
max_model_len = max(1, 31 - len(lens))
model_short = leaf[:max_model_len]
return _sanitize_sheet_name(f"{model_short}{lens}") return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet( def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]] writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
): ):
startrow = 0 """Write all blocks to a sheet with a single to_excel() call.
Pandas+openpyxl can be extremely slow when called many times per sheet.
We flatten blocks into one table with a 'Section' column to keep structure
while making Excel generation fast and deterministic.
"""
if not blocks:
pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
return
combined_parts: list[pd.DataFrame] = []
for title, df in blocks: for title, df in blocks:
pd.DataFrame([[title]]).to_excel( df2 = df.copy()
writer, sheet_name=sheet, index=False, header=False, startrow=startrow # Put the section label as the first column for readability.
) df2.insert(0, "Section", title)
startrow += 1 combined_parts.append(df2)
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
startrow += len(df) + 3 combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
combined.to_excel(writer, sheet_name=sheet, index=False)
def _safe_filename(s: str) -> str: def _safe_filename(s: str) -> str:
s = re.sub(r"[^\w\-.]+", "_", str(s).strip()) # Fast path without the third-party `regex` module.
return s[:180] if len(s) > 180 else s s = " ".join(str(s).strip().split())
allowed = []
for ch in s:
if ch.isalnum() or ch in "._-":
allowed.append(ch)
else:
allowed.append("_")
out = "".join(allowed)
return out[:180] if len(out) > 180 else out
# ----------------------------- # -----------------------------
...@@ -428,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]: ...@@ -428,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def _max_concurrency_ok( def _max_concurrency_ok(
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float df: pd.DataFrame,
conc_col: str,
cfg_col: str,
threshold: float,
slack_pct: float = 0.0,
): ):
if df is None or conc_col not in df.columns or cfg_col not in df.columns: if df is None or conc_col not in df.columns or cfg_col not in df.columns:
return pd.NA return pd.NA
...@@ -441,7 +573,14 @@ def _max_concurrency_ok( ...@@ -441,7 +573,14 @@ def _max_concurrency_ok(
if d.empty: if d.empty:
return pd.NA return pd.NA
ok = d[d[cfg_col] <= threshold] # Accept values up to (1 + slack_pct%) above the SLA.
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
ok = d[d[cfg_col] <= effective_limit]
if ok.empty: if ok.empty:
return pd.NA return pd.NA
...@@ -507,15 +646,25 @@ def build_valid_max_concurrency_summary_html( ...@@ -507,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
if not cfg_cols: if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str) cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = [] rows = []
for cfg in cfg_cols: for cfg in cfg_cols:
ttft_max = ( ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms) _max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None if ttft_group_df is not None
else pd.NA else pd.NA
) )
tpot_max = ( tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms) _max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None if tpot_group_df is not None
else pd.NA else pd.NA
) )
...@@ -544,8 +693,8 @@ def build_valid_max_concurrency_summary_html( ...@@ -544,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
rows.append( rows.append(
{ {
"Configuration": cfg, "Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max, f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max, f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (Both)": both, f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both, "Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both, "TTFT @ Both (ms)": ttft_at_both,
...@@ -620,15 +769,24 @@ def build_valid_max_concurrency_summary_df( ...@@ -620,15 +769,24 @@ def build_valid_max_concurrency_summary_df(
if not cfg_cols: if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str) cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = [] rows = []
for cfg in cfg_cols: for cfg in cfg_cols:
ttft_max = ( ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms) _max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None if ttft_group_df is not None
else pd.NA else pd.NA
) )
tpot_max = ( tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms) _max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None if tpot_group_df is not None
else pd.NA else pd.NA
) )
...@@ -657,8 +815,8 @@ def build_valid_max_concurrency_summary_df( ...@@ -657,8 +815,8 @@ def build_valid_max_concurrency_summary_df(
rows.append( rows.append(
{ {
"Configuration": cfg, "Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max, f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max, f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (Both)": both, f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both, "Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both, "TTFT @ Both (ms)": ttft_at_both,
...@@ -751,7 +909,21 @@ def build_parser() -> argparse.ArgumentParser: ...@@ -751,7 +909,21 @@ def build_parser() -> argparse.ArgumentParser:
help="Reference limit for TPOT plots (ms)", help="Reference limit for TPOT plots (ms)",
) )
# ---- NEW: export options ---- # ---- SLA tolerance (slack) options ----
parser.add_argument(
"--ttft-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TTFT SLA (default: 5).",
)
parser.add_argument(
"--tpot-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TPOT SLA (default: 5).",
)
# ---- export options ----
parser.add_argument( parser.add_argument(
"--excel-out", "--excel-out",
type=str, type=str,
...@@ -843,9 +1015,13 @@ def render_metric_table_html( ...@@ -843,9 +1015,13 @@ def render_metric_table_html(
metric_name = metric_label.lower() metric_name = metric_label.lower()
if "ttft" in metric_name: if "ttft" in metric_name:
styler = _highlight_threshold(display_group, args.ttft_max_ms) styler = _highlight_threshold(
display_group, args.ttft_max_ms, args.ttft_slack_pct
)
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name): elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
styler = _highlight_threshold(display_group, args.tpot_max_ms) styler = _highlight_threshold(
display_group, args.tpot_max_ms, args.tpot_slack_pct
)
else: else:
styler = display_group.style styler = display_group.style
...@@ -962,22 +1138,46 @@ def write_report_group_first( ...@@ -962,22 +1138,46 @@ def write_report_group_first(
csv_dir.mkdir(parents=True, exist_ok=True) csv_dir.mkdir(parents=True, exist_ok=True)
excel_path = args.excel_out or "perf_comparison.xlsx" excel_path = args.excel_out or "perf_comparison.xlsx"
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw: disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
excel_engine = (
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
)
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
excel_engine = "openpyxl"
excel_engine_kwargs = {}
if excel_engine == "xlsxwriter":
# Reduce memory pressure & usually faster writes.
excel_engine_kwargs = {"options": {"constant_memory": True}}
xw_ctx = (
nullcontext(None)
if disable_excel
else pd.ExcelWriter(
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
)
)
with xw_ctx as xw:
used_sheets: set[str] = set()
# ---- Environment sheet (first) ---- # ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment") env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files) env_df = _load_env_df_for_inputs(args, files)
if env_df is None or env_df.empty: if xw is not None:
pd.DataFrame( if env_df is None or env_df.empty:
[ pd.DataFrame(
{ [
"Section": "Environment", {
"Key": "vllm_env.txt", "Section": "Environment",
"Value": "NOT FOUND (or empty)", "Key": "vllm_env.txt",
} "Value": "NOT FOUND (or empty)",
] }
).to_excel(xw, sheet_name=env_sheet, index=False) ]
else: ).to_excel(xw, sheet_name=env_sheet, index=False)
env_df.to_excel(xw, sheet_name=env_sheet, index=False) else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
used_sheets.add(env_sheet)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n') main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys: for gkey in group_keys:
...@@ -993,12 +1193,19 @@ def write_report_group_first( ...@@ -993,12 +1193,19 @@ def write_report_group_first(
main_fh.write(group_header) main_fh.write(group_header)
do_excel = xw is not None
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple) sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet sheet_base = sheet
dedup_i = 1 if do_excel:
while sheet in xw.sheets: dedup_i = 1
dedup_i += 1 while sheet in used_sheets:
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}") dedup_i += 1
suffix = f"_{dedup_i}"
# Ensure uniqueness even when sheet names are truncated.
base = str(sheet_base)
keep = max(1, 31 - len(suffix))
sheet = _sanitize_sheet_name(base[:keep] + suffix)
used_sheets.add(sheet)
excel_blocks: list[tuple[str, pd.DataFrame]] = [] excel_blocks: list[tuple[str, pd.DataFrame]] = []
...@@ -1059,7 +1266,7 @@ def write_report_group_first( ...@@ -1059,7 +1266,7 @@ def write_report_group_first(
) )
excel_blocks.append( excel_blocks.append(
(metric_label, display_group.reset_index(drop=True)) (metric_label, group_df.reset_index(drop=True))
) )
if csv_dir: if csv_dir:
fn = _safe_filename( fn = _safe_filename(
...@@ -1067,7 +1274,7 @@ def write_report_group_first( ...@@ -1067,7 +1274,7 @@ def write_report_group_first(
"/", "_" "/", "_"
) )
) )
display_group.to_csv(csv_dir / f"{fn}.csv", index=False) group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html( summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df, tput_group_df=tput_group_df,
...@@ -1097,9 +1304,13 @@ def write_report_group_first( ...@@ -1097,9 +1304,13 @@ def write_report_group_first(
) )
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False) summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
_write_tables_to_excel_sheet(xw, sheet, excel_blocks) if do_excel:
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
print(f"Wrote Excel: {excel_path}") if disable_excel:
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
else:
print(f"Wrote Excel: {excel_path}")
if csv_dir: if csv_dir:
print(f"Wrote CSVs under: {csv_dir}") print(f"Wrote CSVs under: {csv_dir}")
......
...@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}" ...@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}" MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}" DTYPE_FILTER="${DTYPE_FILTER:-}"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
check_gpus() { check_gpus() {
if command -v nvidia-smi; then if command -v nvidia-smi; then
# check the number of GPUs and GPU type. # check the number of GPUs and GPU type.
...@@ -183,6 +190,304 @@ upload_to_buildkite() { ...@@ -183,6 +190,304 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
} }
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving() {
local test_name=$1
local qps=$2
local max_concurrency=$3
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
}
extract_metric_ms() {
local metric_name=$1
local json_file=$2
[[ -f "$json_file" ]] || return 0
if [[ "$metric_name" == "ttft" ]]; then
jq -r '
[
.ttft_ms.p99?,
.metrics.ttft_ms.p99?,
.ttft.p99?,
.metrics.ttft.p99?,
.p99_ttft_ms?,
.ttft_ms.mean?,
.metrics.ttft_ms.mean?,
.ttft.mean?,
.metrics.ttft.mean?,
.mean_ttft_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
else
jq -r '
[
.tpot_ms.p99?,
.metrics.tpot_ms.p99?,
.tpot.p99?,
.metrics.tpot.p99?,
.p99_tpot_ms?,
.itl_ms.p99?,
.metrics.itl_ms.p99?,
.inter_token_latency_ms.p99?,
.tpot_ms.mean?,
.metrics.tpot_ms.mean?,
.tpot.mean?,
.metrics.tpot.mean?,
.itl_ms.mean?,
.metrics.itl_ms.mean?,
.mean_tpot_ms?,
.mean_itl_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
fi
}
evaluate_sla_from_json() {
local json_file=$1
local ttft
local tpot
local pass
[[ -f "$json_file" ]] || return 2
ttft=$(extract_metric_ms ttft "$json_file")
tpot=$(extract_metric_ms tpot "$json_file")
[[ -n "$ttft" && -n "$tpot" ]] || return 2
pass=$(jq -n \
--argjson ttft "$ttft" \
--argjson tpot "$tpot" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
[[ "$pass" == "true" ]]
}
write_adaptive_summary_json() {
local summary_file=$1
local test_name=$2
local qps=$3
local static_last_pass=$4
local static_first_fail=$5
local final_last_pass=$6
local final_first_fail=$7
jq -n \
--arg test_name "$test_name" \
--arg qps "$qps" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
--arg static_last_pass "${static_last_pass:-}" \
--arg static_first_fail "${static_first_fail:-}" \
--arg final_last_pass "${final_last_pass:-}" \
--arg final_first_fail "${final_first_fail:-}" \
'{
test_name: $test_name,
qps: $qps,
sla_ttft_ms: $sla_ttft,
sla_tpot_ms: $sla_tpot,
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
}' > "$summary_file"
}
run_single_serving_probe() {
local test_name=$1
local qps=$2
local max_concurrency=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
local result_json
local num_prompts_arg=""
local client_command
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
if [[ -f "$result_json" ]]; then
evaluate_sla_from_json "$result_json"
return $?
fi
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "
echo "Adaptive probe: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
adaptive_search: true
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
high=$point
break
else
break
fi
done
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
run_benchmark_tests() { run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command # run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput) # $1: test type (latency or throughput)
...@@ -347,10 +652,48 @@ run_serving_tests() { ...@@ -347,10 +652,48 @@ run_serving_tests() {
server_envs=$(echo "$params" | jq -r '.server_environment_variables') server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params") # vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_envs=$(json2envs "$server_envs") server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
# qps_list # qps_list
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
...@@ -382,14 +725,13 @@ run_serving_tests() { ...@@ -382,14 +725,13 @@ run_serving_tests() {
fi fi
# check if server model and client model is aligned # check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name." echo "Server model and client model must be the same. Skip testcase $test_name."
continue continue
fi fi
server_command="$server_envs vllm serve \ server_command="$server_envs vllm serve $server_model \
$server_args" $server_args"
# run the server # run the server
...@@ -436,6 +778,14 @@ run_serving_tests() { ...@@ -436,6 +778,14 @@ run_serving_tests() {
for max_concurrency in $max_concurrency_list; do for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}" new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
echo " new test name $new_test_name" echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg=""
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
# pass the tensor parallel size, the compilation mode, and the optimization # pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard # level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \ client_command="vllm bench serve \
...@@ -444,8 +794,9 @@ run_serving_tests() { ...@@ -444,8 +794,9 @@ run_serving_tests() {
--result-filename ${new_test_name}.json \ --result-filename ${new_test_name}.json \
--request-rate $qps \ --request-rate $qps \
--max-concurrency $max_concurrency \ --max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \ --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args $client_remote_args " $client_args_effective $client_remote_args "
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
...@@ -467,6 +818,11 @@ run_serving_tests() { ...@@ -467,6 +818,11 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done done
# clean up # clean up
...@@ -532,6 +888,7 @@ main() { ...@@ -532,6 +888,7 @@ main() {
# postprocess benchmarking results # postprocess benchmarking results
pip install tabulate pandas pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite upload_to_buildkite
} }
......
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}
...@@ -149,6 +149,39 @@ ...@@ -149,6 +149,39 @@
"random-output-len": 128 "random-output-len": 128
} }
}, },
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{ {
"test_name": "serving_llama8B_int4_tp1_random_128_128", "test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": { "server_parameters": {
...@@ -188,6 +221,45 @@ ...@@ -188,6 +221,45 @@
"random-output-len": 128 "random-output-len": 128
} }
}, },
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{ {
"test_name": "serving_llama3B_tp1_random_128_128", "test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": { "server_parameters": {
......
...@@ -72,17 +72,6 @@ ...@@ -72,17 +72,6 @@
"random-output-len": 128 "random-output-len": 128
} }
}, },
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{ {
"test_name": "serving_llama8B_tp1_random_128_2048", "test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": { "server_parameters": {
...@@ -106,20 +95,20 @@ ...@@ -106,20 +95,20 @@
} }
}, },
{ {
"test_name": "serving_llama8B_tp4_random_128_2048", "test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": { "server_parameters": {
"tensor_parallel_size": 4 "tensor_parallel_size": 1
}, },
"client_parameters": { "client_parameters": {
"dataset_name": "random", "dataset_name": "random",
"random-input-len": 128, "random-input-len": 2048,
"random-output-len": 2048 "random-output-len": 128
} }
}, },
{ {
"test_name": "serving_llama8B_tp1_random_2048_128", "test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": { "server_parameters": {
"tensor_parallel_size": 1 "tensor_parallel_size": 2
}, },
"client_parameters": { "client_parameters": {
"dataset_name": "random", "dataset_name": "random",
...@@ -128,25 +117,25 @@ ...@@ -128,25 +117,25 @@
} }
}, },
{ {
"test_name": "serving_llama8B_tp2_random_2048_128", "test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": { "server_parameters": {
"tensor_parallel_size": 2 "tensor_parallel_size": 1
}, },
"client_parameters": { "client_parameters": {
"dataset_name": "random", "dataset_name": "random",
"random-input-len": 2048, "random-input-len": 2048,
"random-output-len": 128 "random-output-len": 2048
} }
}, },
{ {
"test_name": "serving_llama8B_tp4_random_2048_128", "test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": { "server_parameters": {
"tensor_parallel_size": 4 "tensor_parallel_size": 2
}, },
"client_parameters": { "client_parameters": {
"dataset_name": "random", "dataset_name": "random",
"random-input-len": 2048, "random-input-len": 2048,
"random-output-len": 128 "random-output-len": 2048
} }
} }
] ]
......
...@@ -39,6 +39,12 @@ When run, benchmark script generates results under **benchmark/results** folder, ...@@ -39,6 +39,12 @@ When run, benchmark script generates results under **benchmark/results** folder,
- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
- `PROMPTS_PER_CONCURRENCY`: Multiplier to compute `num_prompts` for serving tests (`num_prompts = max_concurrency × value`). Overrides JSON `num_prompts`. Default is NULL.
- `ENABLE_ADAPTIVE_CONCURRENCY`: set the value to '1' to enable adaptive SLA-based concurrency search after the static serving max_concurrency sweep. Default value is 0.
- `SLA_TTFT_MS`: default TTFT SLA threshold in milliseconds for adaptive concurrency search. Default value is 3000.
- `SLA_TPOT_MS`: default TPOT SLA threshold in milliseconds for adaptive concurrency search. Default value is 100.
- `ADAPTIVE_MAX_PROBES`: maximum number of extra adaptive search probes. Default value is 8.
- `ADAPTIVE_MAX_CONCURRENCY`: maximum allowed concurrency during adaptive search. Default value is 1024.
### Visualization ### Visualization
......
...@@ -70,4 +70,7 @@ kaldi-native-fbank >= 1.18.7 # required for fireredasr2 test ...@@ -70,4 +70,7 @@ kaldi-native-fbank >= 1.18.7 # required for fireredasr2 test
# Newer versions of datasets require torchcoded, that makes the tests fail in CI because of a missing library. # Newer versions of datasets require torchcoded, that makes the tests fail in CI because of a missing library.
# Older versions are in conflict with teerratorch requirements. # Older versions are in conflict with teerratorch requirements.
datasets>=3.3.0,<=3.6.0 datasets>=3.3.0,<=3.6.0
\ No newline at end of file
openpyxl # required for perf comparison excel report
plotly # required for perf comparison html report
...@@ -202,6 +202,8 @@ email-validator==2.2.0 ...@@ -202,6 +202,8 @@ email-validator==2.2.0
# via pydantic # via pydantic
encodec==0.1.1 encodec==0.1.1
# via vocos # via vocos
et-xmlfile==2.0.0
# via openpyxl
evaluate==0.4.3 evaluate==0.4.3
# via lm-eval # via lm-eval
fastapi==0.128.0 fastapi==0.128.0
...@@ -634,6 +636,8 @@ opencv-python-headless==4.13.0.90 ...@@ -634,6 +636,8 @@ opencv-python-headless==4.13.0.90
# albucore # albucore
# albumentations # albumentations
# mistral-common # mistral-common
openpyxl==3.1.5
# via -r requirements/test.in
opentelemetry-api==1.35.0 opentelemetry-api==1.35.0
# via # via
# opentelemetry-exporter-prometheus # opentelemetry-exporter-prometheus
...@@ -734,7 +738,9 @@ platformdirs==4.3.6 ...@@ -734,7 +738,9 @@ platformdirs==4.3.6
# virtualenv # virtualenv
# wandb # wandb
plotly==5.24.1 plotly==5.24.1
# via genai-perf # via
# -r requirements/test.in
# genai-perf
pluggy==1.5.0 pluggy==1.5.0
# via # via
# pytest # pytest
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment