Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.9
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -60,6 +60,7 @@ def launch_lm_eval(eval_config, tp_size):
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len},"
+        "allow_deprecated_quantization=True,"
    )

    env_vars = eval_config.get("env_vars", None)

--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http

 ## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.

 **Benchmarking Duration**: about 1hr.

@@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh

 Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
@@ -34,8 +34,9 @@ Runtime environment variables:

 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
-For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
->
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
+> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
+
 ### Latency test

 Here is an example of one test inside `latency-tests.json`:
@@ -175,19 +176,6 @@ If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.

-The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
-When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
-If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
-
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-
-|   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
-|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
-| 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982                             | 156.526018                             | 1.097396 |
-| 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334                             | 294.018783                             | 1.216863 |
+#### Performance Results Comparison  

-A comparison diagram will be generated below the table.
-Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
-<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
+Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
 import argparse
+import html as _html
 import json
 import os
+from dataclasses import dataclass
 from importlib import util

 import pandas as pd
@@ -10,27 +15,49 @@ import pandas as pd
 pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None

-
+DEFAULT_INFO_COLS = [
+    "Model",
+    "Dataset Name",
+    "Input Len",
+    "Output Len",
+    #    "TP Size",
+    #    "PP Size",
+    "# of max concurrency.",
+    "qps",
+]
+
+# Safety net: if any DataFrame leaks into to_html(), keep precision at 2.
+pd.set_option("display.precision", 2)
+pd.set_option("display.float_format", lambda x: f"{x:.2f}")
+
+
+# -----------------------------
+# Core data compare
+# -----------------------------
 def compare_data_columns(
-    files, name_column, data_column, info_cols, drop_column, debug=False
+    files: list[str],
+    name_column: str,
+    data_column: str,
+    info_cols: list[str],
+    drop_column: str,
+    debug: bool = False,
 ):
    """
    Align concatenation by keys derived from info_cols instead of row order.
    - Pick one canonical key list: subset of info_cols present in ALL files.
    - For each file: set index to those keys, aggregate duplicates
-    - (mean for metric, first for names).
+      (mean for metric, first for names).
    - Concat along axis=1 (indexes align), then reset_index so callers can
-    - group by columns.
+      group by columns.
    - If --debug, add a <file_label>_name column per file.
    """
    print("\ncompare_data_column:", data_column)

    frames = []
-    raw_data_cols = []
+    raw_data_cols: list[str] = []
    compare_frames = []

-    # 1) choose a canonical key list from info_cols that exists in ALL files
-    cols_per_file = []
+    cols_per_file: list[set] = []
    for f in files:
        try:
            df_tmp = pd.read_json(f, orient="records")
@@ -40,24 +67,20 @@ def compare_data_columns(

    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
    if not key_cols:
-        # soft fallback: use any info_cols present in the first file
        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
    if not key_cols:
        raise ValueError(
            "No common key columns found from info_cols across the input files."
        )

-    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
    meta_added = False

    for file in files:
        df = pd.read_json(file, orient="records")

-        # Keep rows that actually have the compared metric (same as original behavior)
        if drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)

-        # Stabilize numeric key columns (harmless if missing)
        for c in (
            "Input Len",
            "Output Len",
@@ -69,32 +92,26 @@ def compare_data_columns(
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")

-        # Ensure all key columns exist
        for c in key_cols:
            if c not in df.columns:
                df[c] = pd.NA

-        # Set index = key_cols and aggregate duplicates → unique MultiIndex
        df_idx = df.set_index(key_cols, drop=False)

-        # meta (key columns), unique per key
        meta = df_idx[key_cols]
        if not meta.index.is_unique:
            meta = meta.groupby(level=key_cols, dropna=False).first()

-        # metric series for this file, aggregated to one row per key
        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
        s = df_idx[data_column]
        if not s.index.is_unique:
            s = s.groupby(level=key_cols, dropna=False).mean()
-        s.name = file_label  # column label like original
+        s.name = file_label

-        # add meta once (from first file) so keys are the leftmost columns
        if not meta_added:
            frames.append(meta)
            meta_added = True

-        # (NEW) debug: aligned test-name column per file
        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
@@ -106,26 +123,19 @@ def compare_data_columns(
        raw_data_cols.append(file_label)
        compare_frames.append(s)

-        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
        if len(compare_frames) >= 2:
            base = compare_frames[0]
            current = compare_frames[-1]
            if "P99" in data_column or "Median" in data_column:
-                ratio = base / current  # for latency
+                ratio = base / current
            else:
                ratio = current / base
-            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
+            ratio = ratio.mask(base == 0)
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)

-    # 4) concat on columns with aligned MultiIndex;
-    # then reset_index to return keys as columns
-    concat_df = pd.concat(frames, axis=1)
-    concat_df = concat_df.reset_index(drop=True).reset_index()
-    if "index" in concat_df.columns:
-        concat_df = concat_df.drop(columns=["index"])
+    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)

-    # Ensure key/info columns appear first (in your info_cols order)
    front = [c for c in info_cols if c in concat_df.columns]
    rest = [c for c in concat_df.columns if c not in front]
    concat_df = concat_df[front + rest]
@@ -134,20 +144,15 @@ def compare_data_columns(
    return concat_df, raw_data_cols


+# -----------------------------
+# Split helper
+# -----------------------------
 def split_json_by_tp_pp(
    input_file: str = "benchmark_results.json", output_root: str = "."
 ) -> list[str]:
-    """
-    Split a benchmark JSON into separate folders by (TP Size, PP Size).
-
-    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
-    Returns: list of file paths written.
-    """
-    # Load JSON data into DataFrame
    with open(input_file, encoding="utf-8") as f:
        data = json.load(f)

-    # If the JSON is a dict with a list under common keys, use that list
    if isinstance(data, dict):
        for key in ("results", "serving_results", "benchmarks", "data"):
            if isinstance(data.get(key), list):
@@ -156,7 +161,6 @@ def split_json_by_tp_pp(

    df = pd.DataFrame(data)

-    # Keep only "serving" tests
    name_col = next(
        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
    )
@@ -165,7 +169,6 @@ def split_json_by_tp_pp(
            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
        ].copy()

-    # Handle alias column names
    rename_map = {
        "tp_size": "TP Size",
        "tensor_parallel_size": "TP Size",
@@ -176,21 +179,14 @@ def split_json_by_tp_pp(
        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
    )

-    # Ensure TP/PP columns exist (default to 1 if missing)
    if "TP Size" not in df.columns:
        df["TP Size"] = 1
    if "PP Size" not in df.columns:
        df["PP Size"] = 1

-    # make sure TP/PP are numeric ints with no NaN
-    df["TP Size"] = (
-        pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-    df["PP Size"] = (
-        pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
+    df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int)
+    df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int)

-    # Split into separate folders
    saved_paths: list[str] = []
    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
@@ -203,32 +199,9 @@ def split_json_by_tp_pp(
    return saved_paths


-def _add_limit_line(fig, y_value, label):
-    # Visible dashed line + annotation
-    fig.add_hline(
-        y=y_value,
-        line_dash="dash",
-        line_color="red" if "ttft" in label.lower() else "blue",
-        annotation_text=f"{label}: {y_value} ms",
-        annotation_position="top left",
-    )
-    # Optional: add a legend item (as a transparent helper trace)
-    if plot and plotly_found:
-        import plotly.graph_objects as go
-
-        fig.add_trace(
-            go.Scatter(
-                x=[None],
-                y=[None],
-                mode="lines",
-                line=dict(
-                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
-                ),
-                name=f"{label}",
-            )
-        )
-
-
+# -----------------------------
+# Styling helpers
+# -----------------------------
 def _find_concurrency_col(df: pd.DataFrame) -> str:
    for c in [
        "# of max concurrency.",
@@ -239,7 +212,6 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
    ]:
        if c in df.columns:
            return c
-    # Fallback: guess an integer-like column (harmless if unused)
    for c in df.columns:
        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
            return c
@@ -248,8 +220,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:

 def _highlight_threshold(
    df: pd.DataFrame, threshold: float
-) -> "pd.io.formats.style.Styler":
-    """Highlight numeric per-configuration columns with value <= threshold."""
+) -> pd.io.formats.style.Styler:
    conc_col = _find_concurrency_col(df)
    key_cols = [
        c
@@ -260,6 +231,7 @@ def _highlight_threshold(
        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
    ]
    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+
    return df.style.map(
        lambda v: "background-color:#e6ffe6;font-weight:bold;"
        if pd.notna(v) and v <= threshold
@@ -268,7 +240,264 @@ def _highlight_threshold(
    )


-if __name__ == "__main__":
+def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
+    ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
+    if not ratio_cols:
+        return styler
+
+    styler = styler.apply(
+        lambda _: ["background-color: #fff3b0"] * len(styler.data),
+        subset=ratio_cols,
+        axis=0,
+    )
+
+    styler = styler.set_table_styles(
+        [
+            {
+                "selector": f"th.col_heading.level0.col{i}",
+                "props": [("background-color", "#fff3b0")],
+            }
+            for i, col in enumerate(styler.data.columns)
+            if col in ratio_cols
+        ],
+        overwrite=False,
+    )
+    return styler
+
+
+def _apply_two_decimals(
+    styler: pd.io.formats.style.Styler,
+) -> pd.io.formats.style.Styler:
+    df = styler.data
+    num_cols = df.select_dtypes("number").columns
+    if len(num_cols) == 0:
+        return styler
+    return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
+
+
+# -----------------------------
+# Valid max concurrency summary helpers
+# -----------------------------
+def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
+        if c in df.columns
+    ]
+    exclude = set(key_cols + [conc_col, "qps", "QPS"])
+
+    cols: list[str] = []
+    for c in df.columns:
+        if c in exclude:
+            continue
+        lc = str(c).lower()
+        if lc.startswith("ratio"):
+            continue
+        if lc.endswith("_name") or lc == "test name" or lc == "test_name":
+            continue
+        if pd.api.types.is_numeric_dtype(df[c]):
+            cols.append(c)
+    return cols
+
+
+def _max_concurrency_ok(
+    df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
+):
+    if df is None or conc_col not in df.columns or cfg_col not in df.columns:
+        return pd.NA
+
+    d = df[[conc_col, cfg_col]].copy()
+    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+    d = d.dropna(subset=[conc_col, cfg_col])
+
+    if d.empty:
+        return pd.NA
+
+    ok = d[d[cfg_col] <= threshold]
+    if ok.empty:
+        return pd.NA
+
+    return ok[conc_col].max()
+
+
+def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
+    if (
+        df is None
+        or conc_col not in df.columns
+        or cfg_col not in df.columns
+        or pd.isna(conc_value)
+    ):
+        return pd.NA
+
+    d = df[[conc_col, cfg_col]].copy()
+    d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+    d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+
+    conc_value = pd.to_numeric(conc_value, errors="coerce")
+    if pd.isna(conc_value):
+        return pd.NA
+
+    hit = d[d[conc_col] == conc_value]
+    if hit.empty:
+        return pd.NA
+    return hit[cfg_col].iloc[0]
+
+
+def build_valid_max_concurrency_summary_html(
+    tput_group_df: pd.DataFrame | None,
+    ttft_group_df: pd.DataFrame | None,
+    tpot_group_df: pd.DataFrame | None,
+    conc_col: str,
+    args,
+) -> str:
+    if ttft_group_df is None and tpot_group_df is None:
+        return ""
+
+    ttft_cols = (
+        _config_value_columns(ttft_group_df, conc_col)
+        if ttft_group_df is not None
+        else []
+    )
+    tpot_cols = (
+        _config_value_columns(tpot_group_df, conc_col)
+        if tpot_group_df is not None
+        else []
+    )
+    tput_cols = (
+        _config_value_columns(tput_group_df, conc_col)
+        if tput_group_df is not None
+        else []
+    )
+
+    if ttft_group_df is not None and tpot_group_df is not None:
+        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
+        if tput_group_df is not None:
+            cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
+    else:
+        cfg_cols = ttft_cols or tpot_cols
+
+    if not cfg_cols:
+        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
+
+    rows = []
+    for cfg in cfg_cols:
+        ttft_max = (
+            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            if ttft_group_df is not None
+            else pd.NA
+        )
+        tpot_max = (
+            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            if tpot_group_df is not None
+            else pd.NA
+        )
+        both = (
+            pd.NA
+            if (pd.isna(ttft_max) or pd.isna(tpot_max))
+            else min(ttft_max, tpot_max)
+        )
+
+        tput_at_both = (
+            _value_at_concurrency(tput_group_df, conc_col, cfg, both)
+            if tput_group_df is not None
+            else pd.NA
+        )
+        ttft_at_both = (
+            _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
+            if ttft_group_df is not None
+            else pd.NA
+        )
+        tpot_at_both = (
+            _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
+            if tpot_group_df is not None
+            else pd.NA
+        )
+
+        rows.append(
+            {
+                "Configuration": cfg,
+                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (Both)": both,
+                "Output Tput @ Both (tok/s)": tput_at_both,
+                "TTFT @ Both (ms)": ttft_at_both,
+                "TPOT @ Both (ms)": tpot_at_both,
+            }
+        )
+
+    summary_df = pd.DataFrame(rows)
+
+    # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
+    for c in summary_df.columns:
+        if c == "Configuration":
+            continue
+        summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")
+
+    both_col = f"Max {conc_col} (Both)"
+
+    # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
+    formatters = {}
+    for c in summary_df.columns:
+        if c == "Configuration":
+            continue
+        # default argument binds per-column formatter correctly
+        formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
+
+    styler = summary_df.style.format(formatters)
+
+    def _green(v):
+        return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""
+
+    if both_col in summary_df.columns:
+        styler = styler.map(_green, subset=[both_col])
+
+    title = (
+        '<div style="font-size: 1.15em; font-weight: 700; margin: 12px 0 6px 0;">'
+        "Valid Max Concurrency Summary"
+        "</div>\n"
+    )
+    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+# -----------------------------
+# Plot helper
+# -----------------------------
+def _add_limit_line(fig, y_value: float, label: str):
+    fig.add_hline(
+        y=y_value,
+        line_dash="dash",
+        line_color="red" if "ttft" in label.lower() else "blue",
+        annotation_text=f"{label}: {y_value} ms",
+        annotation_position="top left",
+    )
+    if plotly_found:
+        import plotly.graph_objects as go
+
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(
+                    dash="dash",
+                    color="red" if "ttft" in label.lower() else "blue",
+                ),
+                name=label,
+            )
+        )
+
+
+# -----------------------------
+# Refactored main + group-first report
+# -----------------------------
+@dataclass(frozen=True)
+class MetricPlan:
+    data_cols: list[str]
+    drop_column: str
+
+
+def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", action="append", type=str, help="input file name"
@@ -308,149 +537,289 @@ if __name__ == "__main__":
        default=100.0,
        help="Reference limit for TPOT plots (ms)",
    )
+    return parser

-    args = parser.parse_args()

+def choose_metrics(latency: str) -> MetricPlan:
+    latency = (latency or "").lower()
    drop_column = "P99"
-    name_column = "Test name"
-    info_cols = [
-        "Model",
-        "Dataset Name",
-        "Input Len",
-        "Output Len",
-        "TP Size",
-        "PP Size",
-        "# of max concurrency.",
-        "qps",
-    ]

-    if "median" in args.latency:
-        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-        html_msgs_for_data_cols = [
-            "Compare Output Tokens /n",
-            "Median TTFT /n",
-            "Median TPOT /n",
-        ]
-        drop_column = "P99"
-    elif "p99" in args.latency:
-        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
-        html_msgs_for_data_cols = [
-            "Compare Output Tokens /n",
-            "P99 TTFT /n",
-            "P99 TPOT /n",
-        ]
+    if "median" in latency:
+        return MetricPlan(
+            data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
+            drop_column=drop_column,
+        )
+
+    return MetricPlan(
+        data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
+        drop_column=drop_column,
+    )
+
+
+def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
+    if not args.file:
+        raise ValueError("No input files provided. Use -f/--file.")

    if len(args.file) == 1:
        files = split_json_by_tp_pp(args.file[0], output_root="splits")
        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
    else:
        files = args.file
+
+    return files, info_cols
+
+
+def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
+    y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6
+    return info_cols[y_axis_index]
+
+
+def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
+    filtered_info_cols = info_cols[:4]
+    group_cols = [c for c in filtered_info_cols if c in output_df.columns]
+    if not group_cols:
+        raise ValueError(
+            f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
+            f"but DataFrame has: {list(output_df.columns)}"
+        )
+    return group_cols
+
+
+def normalize_group_key(name):
+    return name if isinstance(name, tuple) else (name,)
+
+
+def group_filename(name, prefix: str = "perf_comparison_") -> str:
+    name_vals = normalize_group_key(name)
+    safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
+    return f"{prefix}{safe}.html"
+
+
+def build_group_suffix(group_cols: list[str], name) -> str:
+    name_vals = normalize_group_key(name)
+    return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
+
+
+def render_metric_table_html(
+    display_group: pd.DataFrame,
+    metric_label: str,
+    group_suffix: str,
+    args,
+) -> str:
+    title = (
+        f'<div style="font-size: 1.25em; font-weight: 600; margin: 12px 0;">'
+        f"{_html.escape(metric_label)}"
+        f" — {_html.escape(group_suffix)}"
+        f"</div>\n"
+    )
+
+    metric_name = metric_label.lower()
+    if "ttft" in metric_name:
+        styler = _highlight_threshold(display_group, args.ttft_max_ms)
+    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+        styler = _highlight_threshold(display_group, args.tpot_max_ms)
+    else:
+        styler = display_group.style
+
+    styler = _apply_two_decimals(styler)
+    styler = highlight_ratio_columns(styler)
+
+    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+def maybe_write_plot(
+    main_fh,
+    sub_fh,
+    group_df: pd.DataFrame,
+    raw_data_cols: list[str],
+    metric_label: str,
+    y_axis_col: str,
+    args,
+):
+    if not (args.plot and plotly_found):
+        return
+
+    import plotly.express as px
+
+    df = group_df[raw_data_cols].sort_values(by=y_axis_col)
+    df_melted = df.melt(
+        id_vars=y_axis_col,
+        var_name="Configuration",
+        value_name=metric_label,
+    )
+
+    fig = px.line(
+        df_melted,
+        x=y_axis_col,
+        y=metric_label,
+        color="Configuration",
+        title=f"{metric_label} vs {y_axis_col}",
+        markers=True,
+    )
+
+    # Ensure plot hover + y tick labels are also 2 decimals.
+    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
+    fig.update_yaxes(tickformat=".2f")
+
+    metric_name = metric_label.lower()
+    if "ttft" in metric_name:
+        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+    html = fig.to_html(full_html=True, include_plotlyjs="cdn")
+    main_fh.write(html)
+    sub_fh.write(html)
+
+
+def build_group_keys(
+    df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
+):
+    if sort_cols:
+        df = df.sort_values(by=sort_cols)
+    gb = df.groupby(group_cols, dropna=False)
+    return [k for k, _ in gb]
+
+
+def write_report_group_first(
+    files: list[str], info_cols: list[str], plan: MetricPlan, args
+):
+    name_column = "Test name"
+    y_axis_col = get_y_axis_col(info_cols, args.xaxis)
+
    print("comparing : " + ", ".join(files))
-    debug = args.debug
-    plot = args.plot
-    # For Plot feature, assign y axis from one of info_cols
-    y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
-    with open("perf_comparison.html", "w") as text_file:
-        for i in range(len(data_cols_to_compare)):
-            output_df, raw_data_cols = compare_data_columns(
-                files,
-                name_column,
-                data_cols_to_compare[i],
-                info_cols,
-                drop_column,
-                debug=debug,
+
+    metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
+    group_cols_canonical: list[str] | None = None
+
+    for metric_label in plan.data_cols:
+        output_df, raw_data_cols = compare_data_columns(
+            files,
+            name_column,
+            metric_label,
+            info_cols,
+            plan.drop_column,
+            debug=args.debug,
+        )
+
+        raw_data_cols = list(raw_data_cols)
+        raw_data_cols.insert(0, y_axis_col)
+
+        group_cols = get_group_cols(output_df, info_cols)
+        if group_cols_canonical is None:
+            group_cols_canonical = group_cols
+        else:
+            group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
+
+        metric_cache[metric_label] = (
+            output_df.sort_values(by=args.xaxis),
+            raw_data_cols,
+        )
+
+    if not group_cols_canonical:
+        raise ValueError("No canonical group columns found across metrics.")
+
+    first_metric = plan.data_cols[0]
+    first_df_sorted, _ = metric_cache[first_metric]
+    group_keys = build_group_keys(
+        first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
+    )
+
+    metric_groupbys = {
+        metric_label: df.groupby(group_cols_canonical, dropna=False)
+        for metric_label, (df, _) in metric_cache.items()
+    }
+
+    with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
+        main_fh.write('<meta charset="utf-8">\n')
+        for gkey in group_keys:
+            gkey_tuple = normalize_group_key(gkey)
+            suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+            sub_path = group_filename(gkey_tuple)
+            group_header = (
+                '<div style="font-size: 1.4em; font-weight: 700; '
+                'margin: 18px 0 10px 0;">'
+                f"{_html.escape(suffix)}"
+                "</div>\n"
            )

-            # For Plot feature, insert y axis from one of info_cols
-            raw_data_cols.insert(0, info_cols[y_axis_index])
-
-            filtered_info_cols = info_cols[:-2]
-            existing_group_cols = [
-                c for c in filtered_info_cols if c in output_df.columns
-            ]
-            if not existing_group_cols:
-                raise ValueError(
-                    f"No valid group-by columns  "
-                    f"Expected subset: {filtered_info_cols}, "
-                    f"but DataFrame has: {list(output_df.columns)}"
-                )
-            # output_df_sorted = output_df.sort_values(by=existing_group_cols)
-            output_df_sorted = output_df.sort_values(by=args.xaxis)
-            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
-            for name, group in output_groups:
-                group_name = (
-                    ",".join(map(str, name)).replace(",", "_").replace("/", "-")
-                )
-                group_html_name = "perf_comparison_" + group_name + ".html"
-
-                metric_name = str(data_cols_to_compare[i]).lower()
-                if "tok/s" in metric_name:
-                    html = group.to_html()
-                elif "ttft" in metric_name:
-                    styler = _highlight_threshold(group, args.ttft_max_ms).format(
-                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
-                        na_rep="—",
-                    )
-                    html = styler.to_html(
-                        table_attributes='border="1" class="dataframe"'
+            main_fh.write(group_header)
+            with open(sub_path, "w", encoding="utf-8") as sub_fh:
+                sub_fh.write('<meta charset="utf-8">\n')
+                sub_fh.write(group_header)
+                tput_group_df = None
+                ttft_group_df = None
+                tpot_group_df = None
+                conc_col = args.xaxis
+
+                for metric_label in plan.data_cols:
+                    gb = metric_groupbys[metric_label]
+                    df_sorted, raw_data_cols = metric_cache[metric_label]
+
+                    try:
+                        group_df = gb.get_group(gkey)
+                    except KeyError:
+                        missing = (
+                            '<div style="font-size: 1.1em; font-weight: 600; '
+                            'margin: 10px 0;">'
+                            f"{_html.escape(metric_label)} — missing for this group"
+                            "</div>\n"
+                        )
+
+                        main_fh.write(missing)
+                        sub_fh.write(missing)
+                        continue
+
+                    if conc_col not in group_df.columns:
+                        conc_col = _find_concurrency_col(group_df)
+
+                    mn = metric_label.lower().strip()
+                    if "tok/s" in mn:
+                        tput_group_df = group_df
+                    elif "ttft" in mn:
+                        ttft_group_df = group_df
+                    elif mn in ("p99", "median") or "tpot" in mn:
+                        tpot_group_df = group_df
+
+                    display_group = group_df.drop(
+                        columns=group_cols_canonical, errors="ignore"
                    )
-                elif (
-                    "tpot" in metric_name
-                    or "median" in metric_name
-                    or "p99" in metric_name
-                ):
-                    styler = _highlight_threshold(group, args.tpot_max_ms).format(
-                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
-                        na_rep="—",
+
+                    html = render_metric_table_html(
+                        display_group, metric_label, suffix, args
                    )
-                    html = styler.to_html(
-                        table_attributes='border="1" class="dataframe"'
+                    main_fh.write(html)
+                    sub_fh.write(html)
+
+                    maybe_write_plot(
+                        main_fh,
+                        sub_fh,
+                        group_df=group_df,
+                        raw_data_cols=raw_data_cols,
+                        metric_label=metric_label,
+                        y_axis_col=y_axis_col,
+                        args=args,
                    )

-                text_file.write(html_msgs_for_data_cols[i])
-                text_file.write(html)
-                with open(group_html_name, "a+") as sub_text_file:
-                    sub_text_file.write(html_msgs_for_data_cols[i])
-                    sub_text_file.write(html)
-
-                    if plot and plotly_found:
-                        import plotly.express as px
-
-                        df = group[raw_data_cols]
-                        df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                        # Melt DataFrame for plotting
-                        df_melted = df_sorted.melt(
-                            id_vars=info_cols[y_axis_index],
-                            var_name="Configuration",
-                            value_name=data_cols_to_compare[i],
-                        )
-                        title = (
-                            data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                        )
-                        # Create Plotly line chart
-                        fig = px.line(
-                            df_melted,
-                            x=info_cols[y_axis_index],
-                            y=data_cols_to_compare[i],
-                            color="Configuration",
-                            title=title,
-                            markers=True,
-                        )
+                summary_html = build_valid_max_concurrency_summary_html(
+                    tput_group_df=tput_group_df,
+                    ttft_group_df=ttft_group_df,
+                    tpot_group_df=tpot_group_df,
+                    conc_col=conc_col,
+                    args=args,
+                )
+                if summary_html:
+                    main_fh.write(summary_html)
+                    sub_fh.write(summary_html)

-                        # ---- Add threshold lines based on metric name ----
-                        if "ttft" in metric_name:
-                            _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
-                        elif (
-                            "tpot" in metric_name
-                            or "median" in metric_name
-                            or "p99" in metric_name
-                        ):
-                            _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
-
-                        # Export to HTML
-                        text_file.write(
-                            fig.to_html(full_html=True, include_plotlyjs="cdn")
-                        )
-                        sub_text_file.write(
-                            fig.to_html(full_html=True, include_plotlyjs="cdn")
-                        )
+
+def main():
+    args = build_parser().parse_args()
+    info_cols = list(DEFAULT_INFO_COLS)
+    plan = choose_metrics(args.latency)
+    files, info_cols = prepare_input_files(args, info_cols)
+    write_report_group_first(files, info_cols, plan, args)
+
+
+if __name__ == "__main__":
+    main()
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -49,7 +49,11 @@ check_cpus() {
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
-  declare -g gpu_type="cpu"
+  if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
+    declare -g gpu_type="arm64-cpu"
+  else
+    declare -g gpu_type="cpu"
+  fi
  echo "GPU type is $gpu_type"
 }

@@ -207,8 +211,8 @@ run_latency_tests() {

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -276,8 +280,8 @@ run_throughput_tests() {

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -393,8 +397,8 @@ run_serving_tests() {

    # check if there is enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -496,9 +500,9 @@ run_serving_tests() {
 main() {
  local ARCH
  ARCH=''
-  if [ "$ON_CPU" == "1" ];then
-     check_cpus
-     ARCH='-cpu'
+  if [[ "$ON_CPU" == "1" ]]; then
+    check_cpus
+    ARCH="-$gpu_type"
  else
     check_gpus
     ARCH="$arch_suffix"

--- a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [
+      12,
+      16,
+      24,
+      32,
+      64,
+      128,
+      200
+    ],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "ignore-eos": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama8B_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    }
+  ]
+}
\ No newline at end of file
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -19,10 +19,8 @@
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
-      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
-      "max_num_seqs": 256,
-      "load_format": "dummy"
+      "max_num_seqs": 256
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
@@ -151,6 +149,45 @@
        "random-output-len": 128
      }
    },
+    {
+      "test_name": "serving_llama8B_int4_tp1_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp2_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp4_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
    {
      "test_name": "serving_llama3B_tp1_random_128_128",
      "server_parameters": {

--- a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
 steps:
  # aarch64 + CUDA builds
-  - label: "Build arm64 wheel - CUDA 12.9"
+  - label: "Build wheel - aarch64 - CUDA 12.9"
    depends_on: ~
    id: build-wheel-arm64-cuda-12-9
    agents:
@@ -11,11 +11,11 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

-  - label: "Build arm64 wheel - CUDA 13.0"
+  - label: "Build wheel - aarch64 - CUDA 13.0"
    depends_on: ~
    id: build-wheel-arm64-cuda-13-0
    agents:
@@ -26,12 +26,12 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

  # aarch64 build
-  - label: "Build arm64 CPU wheel"
+  - label: "Build wheel - aarch64 - CPU"
    depends_on: ~
    id: build-wheel-arm64-cpu
    agents:
@@ -40,39 +40,39 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

  # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.9"
+  - label: "Build wheel - x86_64 - CUDA 12.9"
    depends_on: ~
-    id: build-wheel-cuda-12-9
+    id: build-wheel-x86-cuda-12-9
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
    env:
      DOCKER_BUILDKIT: "1"

-  - label: "Build wheel - CUDA 13.0"
+  - label: "Build wheel - x86_64 - CUDA 13.0"
    depends_on: ~
-    id: build-wheel-cuda-13-0
+    id: build-wheel-x86-cuda-13-0
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

  # x86 CPU wheel build
-  - label: "Build x86 CPU wheel"
+  - label: "Build wheel - x86_64 - CPU"
    depends_on: ~
    id: build-wheel-x86-cpu
    agents:
@@ -81,12 +81,12 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

-  # Build release images (12.9)
-  - label: "Build release image (x86)"
+  # Build release images (CUDA 12.9)
+  - label: "Build release image - x86_64 - CUDA 12.9"
    depends_on: ~
    id: build-release-image-x86
    agents:
@@ -99,7 +99,7 @@ steps:
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

-  - label: "Build release image (arm64)"
+  - label: "Build release image - aarch64 - CUDA 12.9"
    depends_on: ~
    id: build-release-image-arm64
    agents:
@@ -109,34 +109,92 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

-  # Add job to create multi-arch manifest
-  - label: "Create multi-arch manifest"
+  - label: "Create multi-arch manifest - CUDA 12.9"
    depends_on:
      - build-release-image-x86
      - build-release-image-arm64
    id: create-multi-arch-manifest
    agents:
-      queue: cpu_queue_postmerge
+      queue: small_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

-  - label: "Annotate release workflow"
+  - label: "Annotate release workflow - CUDA 12.9"
    depends_on:
      - create-multi-arch-manifest
    id: annotate-release-workflow
    agents:
-      queue: cpu_queue_postmerge
+      queue: small_cpu_queue_postmerge
    commands:
      - "bash .buildkite/scripts/annotate-release.sh"

+  - block: "Build CUDA 13.0 release images"
+    key: block-release-image-build-cuda-13-0
+    depends_on: ~
+
+  - label: "Build release image - x86_64 - CUDA 13.0"
+    depends_on: block-release-image-build-cuda-13-0
+    id: build-release-image-x86-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+      # re-tag to default image tag and push, just in case arm64 build fails
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
+  - label: "Build release image - aarch64 - CUDA 13.0"
+    depends_on: block-release-image-build-cuda-13-0
+    id: build-release-image-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+
+  - label: "Create multi-arch manifest - CUDA 13.0"
+    depends_on:
+      - build-release-image-x86-cuda-13-0
+      - build-release-image-arm64-cuda-13-0
+    id: create-multi-arch-manifest-cuda-13-0
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
        key: release-version

+  - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
+    key: block-upload-release-wheels
+    depends_on:
+      - input-release-version
+      - build-wheel-x86-cuda-12-9
+      - build-wheel-x86-cuda-13-0
+      - build-wheel-x86-cpu
+      - build-wheel-arm64-cuda-12-9
+      - build-wheel-arm64-cuda-13-0
+      - build-wheel-arm64-cpu
+
+  - label: "Upload release wheels to PyPI and GitHub"
+    depends_on:
+      - block-upload-release-wheels
+    id: upload-release-wheels
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/upload-release-wheels.sh"
+
  - block: "Build CPU release image"
    key: block-cpu-release-image-build
    depends_on: ~
@@ -169,12 +227,30 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

+  - block: "Build ROCm release image"
+    key: block-rocm-release-image-build
+    depends_on: ~
+
+  - label: "Build release image (ROCm)"
+    depends_on: block-rocm-release-image-build
+    id: build-release-image-rocm
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      # Build base image first
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
+      # Build vLLM ROCm image using the base
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
+
+  
  - label: "Build and publish nightly multi-arch image to DockerHub"
    depends_on:
      - create-multi-arch-manifest
    if: build.env("NIGHTLY") == "1"
    agents:
-      queue: cpu_queue_postmerge
+      queue: small_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
@@ -196,3 +272,365 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"
      DOCKERHUB_USERNAME: "vllmbot"
+
+  # =============================================================================
+  # ROCm Release Pipeline (x86_64 only)
+  # =============================================================================
+  #
+  # vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
+  # To build a specific version, trigger the build from that branch/tag.
+  #
+  # Environment variables for ROCm builds (set via Buildkite UI or schedule):
+  #   ROCM_PYTHON_VERSION: Python version (default: 3.12)
+  #   PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
+  #   ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
+  #   ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
+  #
+  # Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
+  #       (currently rocm/dev-ubuntu-22.04:7.1-complete)
+  #
+  # =============================================================================
+
+  # ROCm Input Step - Collect build configuration (manual trigger only)
+  - input: "ROCm Wheel Release Build Configuration"
+    key: input-rocm-config
+    depends_on: ~
+    if: build.source == "ui"
+    fields:
+      - text: "Python Version"
+        key: "rocm-python-version"
+        default: "3.12"
+        hint: "Python version (e.g., 3.12)"
+      - text: "GPU Architectures"
+        key: "rocm-pytorch-rocm-arch"
+        default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
+        hint: "Semicolon-separated GPU architectures"
+      - select: "Upload Wheels to S3"
+        key: "rocm-upload-wheels"
+        default: "true"
+        options:
+          - label: "No - Build only (nightly/dev)"
+            value: "false"
+          - label: "Yes - Upload to S3 (release)"
+            value: "true"
+      - select: "Force Rebuild Base Wheels"
+        key: "rocm-force-rebuild"
+        default: "false"
+        hint: "Ignore S3 cache and rebuild base wheels from scratch"
+        options:
+          - label: "No - Use cached wheels if available"
+            value: "false"
+          - label: "Yes - Rebuild even if cache exists"
+            value: "true"
+
+  # ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
+  - label: ":rocm: Build ROCm Base Wheels"
+    id: build-rocm-base-wheels
+    depends_on:
+      - step: input-rocm-config
+        allow_failure: true  # Allow failure so non-UI builds can proceed (input step is skipped)
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      # Set configuration and check cache
+      - |
+        set -euo pipefail
+
+        # Get values from meta-data (set by input step) or use defaults
+        PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
+        export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
+
+        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+        export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+        # Check for force rebuild flag
+        ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
+        if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
+          ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
+        fi
+
+        echo "========================================"
+        echo "ROCm Base Wheels Build Configuration"
+        echo "========================================"
+        echo "  PYTHON_VERSION: $${PYTHON_VERSION}"
+        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+        echo "  ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
+        echo "========================================"
+
+        # Save resolved config for later jobs
+        buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
+        buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
+
+        # Check S3 cache for pre-built wheels
+        CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
+        CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
+        echo ""
+        echo "Cache key: $${CACHE_KEY}"
+        echo "Cache path: $${CACHE_PATH}"
+
+        # Save cache key for downstream jobs
+        buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
+
+        CACHE_STATUS="miss"
+        if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
+          CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
+        else
+          echo "Force rebuild requested, skipping cache check"
+        fi
+
+        if [ "$${CACHE_STATUS}" = "hit" ]; then
+          echo ""
+          echo "CACHE HIT! Downloading pre-built wheels..."
+          echo ""
+          .buildkite/scripts/cache-rocm-base-wheels.sh download
+
+          # Set the S3 path for the cached Docker image (for Job 2 to download)
+          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+          # Mark that we used cache (for Docker image handling)
+          buildkite-agent meta-data set "rocm-used-cache" "true"
+
+          echo ""
+          echo "Cache download complete. Skipping Docker build."
+          echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+        else
+          echo ""
+          echo "CACHE MISS. Building from scratch..."
+          echo ""
+
+          # Build full base image (for later vLLM build)
+          DOCKER_BUILDKIT=1 docker buildx build \
+            --file docker/Dockerfile.rocm_base \
+            --tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
+            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+            --build-arg USE_SCCACHE=1 \
+            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+            --build-arg SCCACHE_REGION_NAME=us-west-2 \
+            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+            --load \
+            .
+
+          # Build debs_wheel_release stage for wheel extraction
+          DOCKER_BUILDKIT=1 docker buildx build \
+            --file docker/Dockerfile.rocm_base \
+            --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
+            --target debs_wheel_release \
+            --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+            --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+            --build-arg USE_SCCACHE=1 \
+            --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+            --build-arg SCCACHE_REGION_NAME=us-west-2 \
+            --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+            --load \
+            .
+
+          # Extract wheels from Docker image
+          mkdir -p artifacts/rocm-base-wheels
+          container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
+          docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
+          docker rm $${container_id}
+          echo "Extracted base wheels:"
+          ls -lh artifacts/rocm-base-wheels/
+
+          # Upload wheels to S3 cache for future builds
+          echo ""
+          echo "Uploading wheels to S3 cache..."
+          .buildkite/scripts/cache-rocm-base-wheels.sh upload
+
+          # Export base Docker image for reuse in vLLM build
+          mkdir -p artifacts/rocm-docker-image
+          docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
+          echo "Docker image size:"
+          ls -lh artifacts/rocm-docker-image/
+
+          # Upload large Docker image to S3 (also cached by cache key)
+          S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+          echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
+          aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+          # Save the S3 path for downstream jobs
+          buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+          # Mark that we did NOT use cache
+          buildkite-agent meta-data set "rocm-used-cache" "false"
+
+          echo ""
+          echo "Build complete. Wheels cached for future builds."
+        fi
+    artifact_paths:
+      - "artifacts/rocm-base-wheels/*.whl"
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 2: Build vLLM ROCm Wheel
+  - label: ":python: Build vLLM ROCm Wheel"
+    id: build-rocm-vllm-wheel
+    depends_on:
+      - step: build-rocm-base-wheels
+        allow_failure: false
+    agents:
+      queue: cpu_queue_postmerge
+    timeout_in_minutes: 180
+    commands:
+      # Download artifacts and prepare Docker image
+      - |
+        set -euo pipefail
+
+        # Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
+        # This fixes version detection when tags are moved/force-pushed
+        echo "Fetching latest tags from origin..."
+        git fetch --tags --force origin
+        
+        # Log tag information for debugging version detection
+        echo "========================================"
+        echo "Git Tag Verification"
+        echo "========================================"
+        echo "Current HEAD: $(git rev-parse HEAD)"
+        echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
+        echo ""
+        echo "Recent tags (pointing to commits near HEAD):"
+        git tag -l --sort=-creatordate | head -5
+        echo "setuptools_scm version detection:"
+        pip install -q setuptools_scm 2>/dev/null || true
+        python3 -c "import setuptools_scm; print('  Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo "  (setuptools_scm not available in this environment)"
+        echo "========================================"
+
+        # Download wheel artifacts from current build
+        echo "Downloading wheel artifacts from current build"
+        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+
+        # Download Docker image from S3 (too large for Buildkite artifacts)
+        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
+        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
+          echo "ERROR: rocm-docker-image-s3-path metadata not found"
+          echo "This should have been set by the build-rocm-base-wheels job"
+          exit 1
+        fi
+        echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
+        mkdir -p artifacts/rocm-docker-image
+        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
+
+        # Load base Docker image and capture the tag
+        echo "Loading base Docker image..."
+        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
+        echo "$${LOAD_OUTPUT}"
+        # Extract the actual loaded image tag from "Loaded image: <tag>" output
+        # This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
+        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
+        if [ -z "$${BASE_IMAGE_TAG}" ]; then
+          echo "ERROR: Failed to extract image tag from docker load output"
+          echo "Load output was: $${LOAD_OUTPUT}"
+          exit 1
+        fi
+        echo "Loaded base image: $${BASE_IMAGE_TAG}"
+
+        # Prepare base wheels for Docker build context
+        mkdir -p docker/context/base-wheels
+        touch docker/context/base-wheels/.keep
+        cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
+        echo "Base wheels for vLLM build:"
+        ls -lh docker/context/base-wheels/
+
+        # Get GPU architectures from meta-data
+        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+        echo "========================================"
+        echo "Building vLLM wheel with:"
+        echo "  BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
+        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+        echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+        echo "  BASE_IMAGE: $${BASE_IMAGE_TAG}"
+        echo "========================================"
+
+        # Build vLLM wheel using local checkout (REMOTE_VLLM=0)
+        DOCKER_BUILDKIT=1 docker build \
+          --file docker/Dockerfile.rocm \
+          --target export_vllm_wheel_release \
+          --output type=local,dest=rocm-dist \
+          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
+          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+          --build-arg REMOTE_VLLM=0 \
+          --build-arg GIT_REPO_CHECK=1 \
+          --build-arg USE_SCCACHE=1 \
+          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+          --build-arg SCCACHE_REGION_NAME=us-west-2 \
+          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+          .
+
+        echo "Built vLLM wheel:"
+        ls -lh rocm-dist/*.whl
+
+        # Copy wheel to artifacts directory
+        mkdir -p artifacts/rocm-vllm-wheel
+        cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
+        echo "Final vLLM wheel:"
+        ls -lh artifacts/rocm-vllm-wheel/
+    artifact_paths:
+      - "artifacts/rocm-vllm-wheel/*.whl"
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 3: Upload Wheels to S3
+  - label: ":s3: Upload ROCm Wheels to S3"
+    id: upload-rocm-wheels
+    depends_on:
+      - step: build-rocm-vllm-wheel
+        allow_failure: false
+    agents:
+      queue: cpu_queue_postmerge
+    timeout_in_minutes: 60
+    commands:
+      # Download all wheel artifacts and run upload
+      - |
+        set -euo pipefail
+
+        # Check if upload is enabled (from env var, meta-data, or release branch)
+        ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
+        if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
+          # Try to get from meta-data (input form)
+          ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
+        fi
+
+        echo "========================================"
+        echo "Upload check:"
+        echo "  ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
+        echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+        echo "========================================"
+
+        # Skip upload if not enabled
+        if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
+          echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
+          echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
+          exit 0
+        fi
+
+        echo "Upload enabled, proceeding..."
+
+        # Download artifacts from current build
+        echo "Downloading artifacts from current build"
+        buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+        buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
+
+        # Run upload script
+        bash .buildkite/scripts/upload-rocm-wheels.sh
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 4: Annotate ROCm Wheel Release
+  - label: ":memo: Annotate ROCm wheel release"
+    id: annotate-rocm-release
+    depends_on:
+      - step: upload-rocm-wheels
+        allow_failure: true
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/annotate-rocm-release.sh"
+    env:
+      S3_BUCKET: "vllm-wheels"
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -32,6 +32,7 @@ To download and upload the image:
 \`\`\`
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm

 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -45,6 +46,12 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+docker push vllm/vllm-openai:latest-rocm
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+
 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

--- a/.buildkite/scripts/annotate-rocm-release.sh
+++ b/.buildkite/scripts/annotate-rocm-release.sh
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Generate Buildkite annotation for ROCm wheel release
+
+set -ex
+
+# Get build configuration from meta-data
+# Extract ROCm version dynamically from Dockerfile.rocm_base
+# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
+ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
+PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
+PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+
+# S3 URLs
+S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
+S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
+S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
+ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
+
+buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
+## :rocm: ROCm Wheel Release
+
+### Build Configuration
+| Setting | Value |
+|---------|-------|
+| **ROCm Version** | ${ROCM_VERSION} |
+| **Python Version** | ${PYTHON_VERSION} |
+| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
+| **Branch** | \`${BUILDKITE_BRANCH}\` |
+| **Commit** | \`${BUILDKITE_COMMIT}\` |
+
+### :package: Installation
+
+**Install from this build (by commit):**
+\`\`\`bash
+uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
+
+# Example:
+uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
+\`\`\`
+
+**Install from nightly (if published):**
+\`\`\`bash
+uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
+\`\`\`
+
+### :floppy_disk: Download Wheels Directly
+
+\`\`\`bash
+# List all ROCm wheels
+aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
+
+# Download specific wheels
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
+aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
+\`\`\`
+
+### :gear: Included Packages
+- **vllm**: vLLM with ROCm support
+- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
+- **triton_rocm**: Triton built for ROCm
+- **torchvision**: TorchVision for ROCm PyTorch
+- **amdsmi**: AMD SMI Python bindings
+
+### :warning: Notes
+- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
+- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
+- Platform: Linux x86_64 only
+EOF
--- a/.buildkite/scripts/cache-rocm-base-wheels.sh
+++ b/.buildkite/scripts/cache-rocm-base-wheels.sh
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Cache helper for ROCm base wheels
+#
+# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
+# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
+#
+# Usage:
+#   cache-rocm-base-wheels.sh check    - Check if cache exists, outputs "hit" or "miss"
+#   cache-rocm-base-wheels.sh upload   - Upload wheels to cache
+#   cache-rocm-base-wheels.sh download - Download wheels from cache
+#   cache-rocm-base-wheels.sh key      - Output the cache key
+#
+# Environment variables:
+#   S3_BUCKET          - S3 bucket name (default: vllm-wheels)
+#   PYTHON_VERSION     - Python version (affects cache key)
+#   PYTORCH_ROCM_ARCH  - GPU architectures (affects cache key)
+#
+# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
+#       so changes to ROCm version are captured by the Dockerfile hash.
+
+set -euo pipefail
+
+BUCKET="${S3_BUCKET:-vllm-wheels}"
+DOCKERFILE="docker/Dockerfile.rocm_base"
+CACHE_PREFIX="rocm/cache"
+
+# Generate hash from Dockerfile content + build args
+generate_cache_key() {
+    # Include Dockerfile content
+    if [[ ! -f "$DOCKERFILE" ]]; then
+        echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
+        exit 1
+    fi
+    local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
+
+    # Include key build args that affect the output
+    # These should match the ARGs in Dockerfile.rocm_base that change the build output
+    # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
+    local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
+    local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
+
+    echo "${dockerfile_hash}-${args_hash}"
+}
+
+CACHE_KEY=$(generate_cache_key)
+CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
+
+case "${1:-}" in
+    check)
+        echo "Checking cache for key: ${CACHE_KEY}" >&2
+        echo "Cache path: ${CACHE_PATH}" >&2
+        echo "Variables used in cache key:" >&2
+        echo "  PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
+        echo "  PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
+
+        # Check if cache exists by listing objects
+        # We look for at least one .whl file
+        echo "Running: aws s3 ls ${CACHE_PATH}" >&2
+        S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
+        echo "S3 ls output:" >&2
+        echo "$S3_OUTPUT" | head -5 >&2
+
+        if echo "$S3_OUTPUT" | grep -q "\.whl"; then
+            echo "hit"
+        else
+            echo "miss"
+        fi
+        ;;
+
+    upload)
+        echo "========================================"
+        echo "Uploading wheels to cache"
+        echo "========================================"
+        echo "Cache key: ${CACHE_KEY}"
+        echo "Cache path: ${CACHE_PATH}"
+        echo ""
+
+        if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
+            echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
+            exit 1
+        fi
+
+        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
+            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
+            exit 1
+        fi
+
+        echo "Uploading $WHEEL_COUNT wheels..."
+        aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
+
+        echo ""
+        echo "Cache upload complete!"
+        echo "========================================"
+        ;;
+
+    download)
+        echo "========================================"
+        echo "Downloading wheels from cache"
+        echo "========================================"
+        echo "Cache key: ${CACHE_KEY}"
+        echo "Cache path: ${CACHE_PATH}"
+        echo ""
+
+        mkdir -p artifacts/rocm-base-wheels
+        aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
+
+        echo ""
+        echo "Downloaded wheels:"
+        ls -lh artifacts/rocm-base-wheels/
+
+        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        echo ""
+        echo "Total: $WHEEL_COUNT wheels"
+        echo "========================================"
+        ;;
+
+    key)
+        echo "${CACHE_KEY}"
+        ;;
+
+    path)
+        echo "${CACHE_PATH}"
+        ;;
+
+    *)
+        echo "Usage: $0 {check|upload|download|key|path}" >&2
+        echo "" >&2
+        echo "Commands:" >&2
+        echo "  check    - Check if cache exists, outputs 'hit' or 'miss'" >&2
+        echo "  upload   - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
+        echo "  download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
+        echo "  key      - Output the cache key" >&2
+        echo "  path     - Output the full S3 cache path" >&2
+        exit 1
+        ;;
+esac
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -16,6 +16,18 @@ from urllib.parse import quote

 import regex as re

+
+def normalize_package_name(name: str) -> str:
+    """
+    Normalize package name according to PEP 503.
+    https://peps.python.org/pep-0503/#normalized-names
+
+    Replace runs of underscores, hyphens, and periods with a single hyphen,
+    and lowercase the result.
+    """
+    return re.sub(r"[-_.]+", "-", name).lower()
+
+
 if not sys.version_info >= (3, 12):
    raise RuntimeError("This script requires Python 3.12 or higher.")

@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo:
            version = version.removesuffix("." + variant)
    else:
        if "+" in version:
-            version, variant = version.split("+")
+            version_part, suffix = version.split("+", 1)
+            # Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
+            # Git hashes and other suffixes are NOT variants
+            if suffix.startswith(("rocm", "cu", "cpu")):
+                variant = suffix
+                version = version_part
+            # Otherwise keep the full version string (variant stays None)

    return WheelFileInfo(
        package_name=package_name,
@@ -206,6 +224,26 @@ def generate_index_and_metadata(
        print("No wheel files found, skipping index generation.")
        return

+    # For ROCm builds: inherit variant from vllm wheel
+    # All ROCm wheels should share the same variant as vllm
+    rocm_variant = None
+    for file in parsed_files:
+        if (
+            file.package_name == "vllm"
+            and file.variant
+            and file.variant.startswith("rocm")
+        ):
+            rocm_variant = file.variant
+            print(f"Detected ROCm variant from vllm: {rocm_variant}")
+            break
+
+    # Apply ROCm variant to all wheels without a variant
+    if rocm_variant:
+        for file in parsed_files:
+            if file.variant is None:
+                file.variant = rocm_variant
+                print(f"Inherited variant '{rocm_variant}' for {file.filename}")
+
    # Group by variant
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
@@ -256,8 +294,8 @@ def generate_index_and_metadata(

        variant_dir.mkdir(parents=True, exist_ok=True)

-        # gather all package names in this variant
-        packages = set(f.package_name for f in files)
+        # gather all package names in this variant (normalized per PEP 503)
+        packages = set(normalize_package_name(f.package_name) for f in files)
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
@@ -269,8 +307,10 @@ def generate_index_and_metadata(
                f.write(project_list_str)

        for package in packages:
-            # filter files belonging to this package only
-            package_files = [f for f in files if f.package_name == package]
+            # filter files belonging to this package only (compare normalized names)
+            package_files = [
+                f for f in files if normalize_package_name(f.package_name) == package
+            ]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            index_str, metadata_str = generate_package_index_and_metadata(
@@ -291,6 +331,7 @@ if __name__ == "__main__":
    """
    Arguments:
        --version <version> : version string for the current build (e.g., commit hash)
+        --wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
        --output-dir <output_directory> : directory to store generated index files
        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
@@ -318,6 +359,12 @@ if __name__ == "__main__":
        required=True,
        help="Directory to store generated index files",
    )
+    parser.add_argument(
+        "--wheel-dir",
+        type=str,
+        default=None,
+        help="Directory containing wheel files (default to be same as `version`)",
+    )
    parser.add_argument(
        "--alias-to-default",
        type=str,
@@ -334,8 +381,13 @@ if __name__ == "__main__":
    args = parser.parse_args()

    version = args.version
-    if "/" in version or "\\" in version:
-        raise ValueError("Version string must not contain slashes.")
+    # Allow rocm/ prefix, reject other slashes and all backslashes
+    if "\\" in version:
+        raise ValueError("Version string must not contain backslashes.")
+    if "/" in version and not version.startswith("rocm/"):
+        raise ValueError(
+            "Version string must not contain slashes (except for 'rocm/' prefix)."
+        )
    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    if not output_dir.exists():
@@ -372,7 +424,7 @@ if __name__ == "__main__":

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

-    # keep only "official" files for a non-nightly version (specifed by cli args)
+    # keep only "official" files for a non-nightly version (specified by cli args)
    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
    if PY_VERSION_RE.match(version):
        # upload-wheels.sh ensures no "dev" is in args.version
@@ -384,9 +436,25 @@ if __name__ == "__main__":
        print("Nightly version detected, keeping all wheel files.")

    # Generate index and metadata, assuming wheels and indices are stored as:
-    # s3://vllm-wheels/{version}/<wheel files>
+    # s3://vllm-wheels/{wheel_dir}/<wheel files>
    # s3://vllm-wheels/<anything>/<index files>
-    wheel_base_dir = Path(output_dir).parent / version
+    #
+    # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
+    #   - rocm/{commit}/  (same as wheels)
+    #   - rocm/nightly/
+    #   - rocm/{version}/
+    # All these are under the "rocm/" prefix, so relative paths should be
+    # relative to "rocm/", not the bucket root.
+    if args.wheel_dir:
+        # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
+        wheel_dir = args.wheel_dir.strip().rstrip("/")
+    elif version.startswith("rocm/"):
+        # For rocm/commit, wheel_base_dir should be just the commit part
+        # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
+        wheel_dir = version.split("/", 1)[1]
+    else:
+        wheel_dir = version
+    wheel_base_dir = Path(output_dir).parent / wheel_dir
    index_base_dir = Path(output_dir)

    generate_index_and_metadata(

--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
@@ -210,12 +209,21 @@ if [[ $commands == *"--shard-id="* ]]; then
    wait "${pid}"
    STATUS+=($?)
  done
+  at_least_one_shard_with_tests=0
  for st in "${STATUS[@]}"; do
-    if [[ ${st} -ne 0 ]]; then
+    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
+    elif [[ ${st} -eq 5 ]]; then
+      echo "Shard exited with status 5 (no tests collected) - treating as success"
+    else # This means st is 0
+      at_least_one_shard_with_tests=1
    fi
  done
+  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
+    echo "All shards reported no tests collected. Failing the build."
+    exit 1
+  fi
 else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -50,6 +50,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
    pytest -x -v -s tests/kernels/test_onednn.py"

  # Run basic model test
@@ -83,7 +84,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
-    tests/lora/test_qwen2vl.py"
+    tests/lora/test_qwenvl.py"

  # online serving: tp+pp
  docker exec cpu-test-"$NUMA_NODE" bash -c '

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"